Skip to main content

gtars_core/
utils.rs

1use std::collections::HashMap;
2use std::ffi::OsStr;
3use std::fs::File;
4use std::io::prelude::*;
5#[cfg(feature = "http")]
6use std::io::Cursor;
7use std::io::{BufRead, BufReader};
8use std::path::{Path, PathBuf};
9use std::str::FromStr;
10
11use anyhow::{Context, Result};
12use flate2::read::MultiGzDecoder;
13#[cfg(feature = "http")]
14use std::error::Error;
15#[cfg(feature = "http")]
16use ureq::{get, Error as UreqError};
17
18use crate::models::region::Region;
19
/// File formats that can be recognised from a file name or type string.
///
/// Any type string that is not one of the known formats is represented by
/// [`FileType::UNKNOWN`] rather than by an error.
#[derive(Debug, Clone, PartialEq, Eq)]
// The upper-case variant names are kept so existing references keep compiling.
#[allow(clippy::upper_case_acronyms)]
pub enum FileType {
    /// Selected by the type string `bed` (case-insensitive).
    BED,
    /// Selected by the type string `bam` (case-insensitive).
    BAM,
    /// Selected by the type string `narrowpeak` (case-insensitive).
    NARROWPEAK,
    /// Fallback for every type string that is not handled.
    UNKNOWN,
}
28
29impl FromStr for FileType {
30    type Err = String;
31
32    fn from_str(s: &str) -> Result<Self, Self::Err> {
33        match s.to_lowercase().as_str() {
34            "bed" => Ok(FileType::BED),
35            "bam" => Ok(FileType::BAM),
36            "narrowpeak" => Ok(FileType::NARROWPEAK),
37            _ => Ok(FileType::UNKNOWN), // Return UNKNOWN for unhandled types
38                                        //_ => Err(format!("Invalid file type: {}", s)),
39        }
40    }
41}
42
43pub struct FileInfo {
44    pub file_type: FileType,
45    pub is_gzipped: bool,
46}
47
48pub fn get_file_info(path: &Path) -> FileInfo {
49    let mut file_type = FileType::UNKNOWN;
50    let mut is_gzipped = false;
51
52    if let Some(os_str_filename) = path.file_name() {
53        if let Some(filename) = os_str_filename.to_str() {
54            // Check for .gz first
55            if filename.ends_with(".gz") {
56                is_gzipped = true;
57                if let Some(base_filename) = filename.strip_suffix(".gz") {
58                    // Try to get the extension before .gz
59                    if let Some(ext) = PathBuf::from(base_filename)
60                        .extension()
61                        .and_then(|e| e.to_str())
62                    {
63                        file_type = FileType::from_str(ext).unwrap_or(FileType::UNKNOWN);
64                    } else {
65                        // If there's no extension before .gz (e.g., "my_data.gz"),
66                        // you might want to handle this specifically or leave as UNKNOWN.
67                        // For now, we'll try to parse the whole base_filename as a type
68                        file_type = FileType::from_str(base_filename).unwrap_or(FileType::UNKNOWN);
69                    }
70                }
71            } else {
72                // Not gzipped, just get the direct extension
73                if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
74                    file_type = FileType::from_str(ext).unwrap_or(FileType::UNKNOWN);
75                }
76            }
77        }
78    }
79
80    FileInfo {
81        file_type,
82        is_gzipped,
83    }
84}
85
/// Splits one tab-separated, BED-like line into `(contig, start, end)`.
///
/// Only the first three columns are looked at; anything after them is ignored.
/// A start or end column that is missing or is not a valid `i32` is reported
/// as `-1`.
pub fn parse_bedlike_file(line: &str) -> Option<(String, i32, i32)> {
    let mut columns = line.split('\t');

    // First column is the contig (chromosome) name.
    let contig = columns.next()?.to_string();

    // Pulls the next column and reads it as a coordinate, falling back to -1.
    let mut next_coordinate = || {
        columns
            .next()
            .and_then(|raw| raw.parse::<i32>().ok())
            .unwrap_or(-1)
    };
    let start = next_coordinate();
    let end = next_coordinate();

    Some((contig, start, end))
}
107
108///
109/// Get a reader for either a gzip'd or non-gzip'd file.
110///
111/// # Arguments
112///
113/// - path: path to the file to read
114///
115pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
116    let is_gzipped = path.extension() == Some(OsStr::new("gz"));
117    let file = File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
118    let file: Box<dyn Read> = match is_gzipped {
119        true => Box::new(MultiGzDecoder::new(file)),
120        false => Box::new(file),
121    };
122
123    let reader = BufReader::new(file);
124
125    Ok(reader)
126}
127
///
/// Get a reader for a URL, for either a gzip'd or non-gzip'd remote file.
///
/// The whole response body is downloaded into memory before the reader is
/// returned. The body is decompressed with a `MultiGzDecoder` when the URL
/// (ignoring any `?query` or `#fragment` part) ends with `.gz`.
///
/// `ftp://` URLs are not really supported: they are rewritten to `https://`
/// and a warning is written to stderr.
///
/// # Arguments
///
/// - url: the URL to fetch, carried in a [`Path`]; it must be valid UTF-8
///
/// # Errors
///
/// Returns an error when the URL is not valid UTF-8, when the request fails or
/// answers with an HTTP error status, or when the body cannot be read.
///
#[cfg(feature = "http")]
pub fn get_dynamic_reader_from_url(
    url: &Path,
) -> Result<BufReader<Box<dyn std::io::Read>>, Box<dyn Error>> {
    let mut url_str = url
        .to_str()
        .ok_or("URL path is not valid UTF-8")?
        .to_string();

    if url_str.starts_with("ftp://") {
        // Diagnostics go to stderr so that data written to stdout stays clean.
        eprintln!("ftp is not fully implemented. Bugs could appear");
        url_str = url_str.replacen("ftp://", "https://", 1);
    }

    // Perform request
    let response = match get(&url_str).call() {
        Ok(resp) => resp,
        Err(UreqError::StatusCode(code)) => {
            return Err(format!("HTTP status {} when fetching {}", code, url_str).into())
        }
        Err(e) => return Err(format!("Request error when fetching {}: {}", url_str, e).into()),
    };

    // Read the entire HTTP response body into memory as a Vec<u8>
    let mut bytes = Vec::new();
    response
        .into_body()
        .into_reader()
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed reading response body from {}: {}", url_str, e))?;

    // Look at the URL both as given and with any query string / fragment cut
    // off, so `…/file.bed.gz?token=…` is still recognised as gzipped.
    let resource = url_str
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(url_str.as_str());
    let is_gzipped = url_str.ends_with(".gz") || resource.ends_with(".gz");

    let cursor = Cursor::new(bytes);
    let reader: Box<dyn std::io::Read> = if is_gzipped {
        Box::new(MultiGzDecoder::new(cursor))
    } else {
        Box::new(cursor)
    };

    Ok(BufReader::new(reader))
}
178
179/// Get a reader for either a gzipped, non-gzipped file, or stdin
180///
181/// # Arguments
182///
183/// - file_path: path to the file to read, or '-' for stdin
184///
185/// # Returns
186///
187/// A `BufReader` object for a given file path or stdin.
188pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<dyn Read>>> {
189    if file_path_str == "-" {
190        Ok(BufReader::new(Box::new(std::io::stdin()) as Box<dyn Read>))
191    } else {
192        let file_path = Path::new(file_path_str);
193        get_dynamic_reader(file_path)
194    }
195}
196
197///
198/// Create a region-to-id hash-map from a list of regions
199///
200/// # Arguments:
201/// - regions: vec![] of [Region] structs
202pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap<Region, u32> {
203    let mut current_id = 0;
204    let mut region_to_id: HashMap<Region, u32> = HashMap::new();
205    for region in regions.iter() {
206        region_to_id.entry(region.to_owned()).or_insert_with(|| {
207            let old_id = current_id;
208            current_id += 1;
209            old_id
210        });
211    }
212
213    region_to_id
214}
215
216///
217/// Generate an id-to-region hash-map from a list of regions
218///
219/// # Arguments:
220/// - regions: vec![] of [Region] structs
221pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap<u32, Region> {
222    let mut current_id = 0;
223    let mut id_to_region: HashMap<u32, Region> = HashMap::new();
224
225    for region in regions.iter() {
226        id_to_region.entry(current_id).or_insert_with(|| {
227            current_id += 1;
228            region.clone()
229        });
230    }
231
232    id_to_region
233}
234
///
/// Build a region-string-to-id lookup from a list of region strings.
///
/// Ids start at 0 and are handed out in order of first appearance; a string
/// that occurs again keeps the id of its first occurrence.
///
/// # Arguments:
/// - regions: slice of region strings in the form `chr:start-end`
pub fn generate_region_string_to_id_map(regions: &[String]) -> HashMap<String, u32> {
    use std::collections::hash_map::Entry;

    let mut ids: HashMap<String, u32> = HashMap::new();
    let mut next_id = 0;

    for name in regions {
        // Only a string that has not been seen yet consumes an id.
        if let Entry::Vacant(slot) = ids.entry(name.to_owned()) {
            slot.insert(next_id);
            next_id += 1;
        }
    }

    ids
}
253
///
/// Build an id-to-region-string lookup from a list of region strings.
///
/// The id of a string is its position in `regions`, starting at 0. Note that,
/// unlike [`generate_region_string_to_id_map`], repeated strings are not
/// collapsed: every input position gets its own id.
///
/// # Arguments:
/// - regions: slice of region strings in the form `chr:start-end`
pub fn generate_id_to_region_string_map(regions: &[String]) -> HashMap<u32, String> {
    (0u32..).zip(regions.iter().cloned()).collect()
}
272
/// Reads a chrom-sizes file into a `name -> size` map.
///
/// Each non-blank line must hold a chromosome name followed by its size,
/// separated by whitespace; any further columns are ignored. Blank lines are
/// skipped. When a name occurs more than once the last size wins.
///
/// # Panics
///
/// The signature has no error channel, so this function panics when the file
/// cannot be opened or read, when a line has a name but no size, or when a
/// size is not a valid `u32`. The panic message names the file and, for
/// malformed lines, the line number.
pub fn get_chrom_sizes<T: AsRef<Path>>(path: T) -> HashMap<String, u32> {
    let path = path.as_ref();
    let chrom_sizes_file = File::open(path)
        .unwrap_or_else(|e| panic!("Failed to open chrom sizes file {:?}: {}", path, e));

    let mut chrom_sizes: HashMap<String, u32> = HashMap::new();

    for (index, line) in BufReader::new(chrom_sizes_file).lines().enumerate() {
        let line_number = index + 1;
        let line = line.unwrap_or_else(|e| {
            panic!("Error while reading chrom sizes file {:?}: {}", path, e)
        });

        let mut fields = line.split_whitespace();
        let (chrom, size) = match (fields.next(), fields.next()) {
            // Nothing but whitespace: tolerate blank lines (e.g. at end of file).
            (None, _) => continue,
            (Some(chrom), Some(size)) => (chrom, size),
            (Some(chrom), None) => panic!(
                "Missing size for '{}' on line {} of chrom sizes file {:?}",
                chrom, line_number, path
            ),
        };

        let size = size.parse::<u32>().unwrap_or_else(|e| {
            panic!(
                "Invalid size '{}' for '{}' on line {} of chrom sizes file {:?}: {}",
                size, chrom, line_number, path, e
            )
        });

        chrom_sizes.insert(chrom.to_string(), size);
    }

    chrom_sizes
}
298
299///
300/// Gen
301pub fn generate_ordering_map_for_universe_regions<T: AsRef<Path>>(
302    path: T,
303) -> Result<HashMap<Region, f64>> {
304    let mut map = HashMap::new();
305
306    let reader = get_dynamic_reader(path.as_ref())?;
307
308    for line in reader.lines() {
309        let line = line?;
310        let parts: Vec<&str> = line.split('\t').collect();
311
312        if parts.len() < 5 {
313            anyhow::bail!("BED file line does not have at least 5 fields: {}. It needs to have chr, start, end, name, and score.", line);
314        }
315
316        // parse the fields
317        let chr = parts[0];
318        let start = parts[1].parse::<u32>().with_context(|| {
319            format!("Failed to parse start position in BED file line: {}", line)
320        })?;
321
322        let end = parts[2]
323            .parse::<u32>()
324            .with_context(|| format!("Failed to parse end position in BED file line: {}", line))?;
325
326        let score = parts[4]
327            .parse::<f64>()
328            .with_context(|| format!("Failed to parse score in BED file line: {}", line))?;
329
330        let rest = Some(parts[3..].join("\t")).filter(|s| !s.is_empty());
331
332        let region = Region {
333            chr: chr.to_owned(),
334            start,
335            end,
336            rest,
337        };
338
339        map.insert(region, score);
340    }
341
342    Ok(map)
343}
344
345pub fn read_bedset_file<P: AsRef<Path>>(file_path: P) -> Result<Vec<String>> {
346    let file = File::open(file_path)?;
347    let reader = BufReader::new(file);
348
349    let bed_identifiers = reader
350        .lines()
351        .map(|line| line.map(|s| s.trim().to_string()))
352        .collect::<Result<Vec<_>, _>>()?;
353
354    Ok(bed_identifiers)
355}
356
/// Builds a sort key that puts chromosome names in karyotypic order.
///
/// An optional leading `chr` is ignored. The key is `(group, number, name)`:
///
/// - group 0: purely numeric names, ordered by their numeric value
/// - group 1: `X`
/// - group 2: `Y`
/// - group 3: `M` / `MT`
/// - group 4: everything else, ordered alphabetically by the prefix-less name
pub fn chrom_karyotype_key(chr: &str) -> (u8, u32, String) {
    let name = chr.strip_prefix("chr").unwrap_or(chr);

    // Numbered chromosomes sort first, by value rather than by text.
    if let Ok(number) = name.parse::<u32>() {
        return (0, number, String::new());
    }

    let (group, label) = match name {
        "X" => (1, ""),
        "Y" => (2, ""),
        "M" | "MT" => (3, ""),
        other => (4, other),
    };
    (group, 0, label.to_string())
}
371
/// Returns the file name of `path` with every extension stripped.
///
/// For example `a/b/file.bed.gz` gives `file`. Non-UTF-8 parts of the name are
/// converted lossily.
///
/// A path that has no file name (such as the empty path, `/` or `..`) yields
/// an empty string instead of panicking.
pub fn remove_all_extensions(path: &Path) -> String {
    // Dropping the last extension first mirrors `Path::file_stem`.
    let mut current = match path.file_stem() {
        Some(stem) => path.with_file_name(stem),
        None => return String::new(),
    };

    // Keep peeling extensions until none is left (`file.bed` -> `file`).
    while current.extension().is_some() {
        current.set_extension("");
    }

    match current.file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => String::new(),
    }
}
388
#[cfg(test)]
mod tests {
    use super::*;

    /// Sorts `names` with [`chrom_karyotype_key`] as the sort key and returns them.
    fn karyotype_sorted(mut names: Vec<&str>) -> Vec<&str> {
        names.sort_by_key(|name| chrom_karyotype_key(name));
        names
    }

    #[test]
    fn test_chrom_karyotype_sort_order() {
        // Numbered chromosomes first (by value), then X, Y, M, then the rest.
        let sorted = karyotype_sorted(vec![
            "chrM", "chrX", "chr2", "chr10", "chr1", "chrY", "chrUn_gl",
        ]);
        let expected = vec!["chr1", "chr2", "chr10", "chrX", "chrY", "chrM", "chrUn_gl"];
        assert_eq!(sorted, expected);
    }

    #[test]
    fn test_chrom_karyotype_without_prefix() {
        // The "chr" prefix is optional.
        let sorted = karyotype_sorted(vec!["MT", "X", "2", "1", "Y"]);
        let expected = vec!["1", "2", "X", "Y", "MT"];
        assert_eq!(sorted, expected);
    }
}