gtars_core/
utils.rs

1use std::collections::HashMap;
2use std::ffi::OsStr;
3use std::fs::File;
4use std::io::prelude::*;
5use std::io::{BufRead, BufReader, Cursor};
6use std::path::{Path, PathBuf};
7use std::str::FromStr;
8
9use anyhow::{Context, Result};
10use flate2::read::{GzDecoder, MultiGzDecoder};
11use std::error::Error;
12#[cfg(feature = "http")]
13use ureq::{get, Error as UreqError};
14
15use crate::models::region::Region;
16
/// The genomic file formats this crate recognizes by extension.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(clippy::upper_case_acronyms)]
pub enum FileType {
    BED,
    BAM,
    NARROWPEAK,
    /// Fallback for any extension that is not handled above.
    UNKNOWN,
}

impl FromStr for FileType {
    type Err = String;

    /// Case-insensitive parse of a file extension into a [FileType].
    ///
    /// This never returns `Err`: unrecognized strings map to
    /// [FileType::UNKNOWN] so callers can decide how to handle them.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_lowercase().as_str() {
            "bed" => Ok(FileType::BED),
            "bam" => Ok(FileType::BAM),
            "narrowpeak" => Ok(FileType::NARROWPEAK),
            // Unhandled extensions are deliberately not an error.
            _ => Ok(FileType::UNKNOWN),
        }
    }
}
39
/// File-type metadata detected from a path by `get_file_info`.
pub struct FileInfo {
    // Detected format; UNKNOWN when the extension is unrecognized.
    pub file_type: FileType,
    // True when the file name ends in `.gz`.
    pub is_gzipped: bool,
}
44
45pub fn get_file_info(path: &Path) -> FileInfo {
46    let mut file_type = FileType::UNKNOWN;
47    let mut is_gzipped = false;
48
49    if let Some(os_str_filename) = path.file_name() {
50        if let Some(filename) = os_str_filename.to_str() {
51            // Check for .gz first
52            if filename.ends_with(".gz") {
53                is_gzipped = true;
54                if let Some(base_filename) = filename.strip_suffix(".gz") {
55                    // Try to get the extension before .gz
56                    if let Some(ext) = PathBuf::from(base_filename)
57                        .extension()
58                        .and_then(|e| e.to_str())
59                    {
60                        file_type = FileType::from_str(ext).unwrap_or(FileType::UNKNOWN);
61                    } else {
62                        // If there's no extension before .gz (e.g., "my_data.gz"),
63                        // you might want to handle this specifically or leave as UNKNOWN.
64                        // For now, we'll try to parse the whole base_filename as a type
65                        file_type = FileType::from_str(base_filename).unwrap_or(FileType::UNKNOWN);
66                    }
67                }
68            } else {
69                // Not gzipped, just get the direct extension
70                if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
71                    file_type = FileType::from_str(ext).unwrap_or(FileType::UNKNOWN);
72                }
73            }
74        }
75    }
76
77    FileInfo {
78        file_type,
79        is_gzipped,
80    }
81}
82
/// Parses a line of a BED-like file into a contig (chromosome), start and end.
/// Any columns beyond the third are ignored.
///
/// The start/end fields fall back to `-1` when missing or unparseable — a
/// sentinel the existing callers expect, so it is preserved here.
pub fn parse_bedlike_file(line: &str) -> Option<(String, i32, i32)> {
    let mut fields = line.split('\t');
    // First field is the contig (chromosome) name.
    let ctg = fields.next()?;
    // Parse 2nd and 3rd fields as integers, or use -1 on failure.
    let st = fields
        .next()
        .and_then(|s| s.parse::<i32>().ok())
        .unwrap_or(-1);
    let en = fields
        .next()
        .and_then(|s| s.parse::<i32>().ok())
        .unwrap_or(-1);

    // `to_string` replaces the original `ctg.parse().unwrap()`: parsing a
    // &str into a String is infallible, so the unwrap was pure noise.
    Some((ctg.to_string(), st, en))
}
104
105///
106/// Get a reader for either a gzip'd or non-gzip'd file.
107///
108/// # Arguments
109///
110/// - path: path to the file to read
111///
112pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
113    let is_gzipped = path.extension() == Some(OsStr::new("gz"));
114    let file = File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
115    let file: Box<dyn Read> = match is_gzipped {
116        true => Box::new(MultiGzDecoder::new(file)),
117        false => Box::new(file),
118    };
119
120    let reader = BufReader::new(file);
121
122    Ok(reader)
123}
124
125///
126/// Get a reader for url ling. Either for gzip'd or non-gzip'd file
127///
128/// # Arguments
129///
130/// - path: path to the file to read
131///
132#[cfg(feature = "http")]
133pub fn get_dynamic_reader_from_url(
134    url: &Path,
135) -> Result<BufReader<Box<dyn std::io::Read>>, Box<dyn Error>> {
136    let mut url_str = url
137        .to_str()
138        .ok_or_else(|| "URL path is not valid UTF-8")?
139        .to_string();
140
141    let is_ftp = url_str.starts_with("ftp://");
142    if is_ftp {
143        println!("ftp is not fully implemented. Bugs could appear");
144        url_str = url_str.replacen("ftp://", "http://", 1);
145    }
146
147    // Perform request
148    let response = match get(&url_str).call() {
149        Ok(resp) => resp,
150        Err(UreqError::StatusCode(code)) => {
151            return Err(format!("HTTP status {} when fetching {}", code, url_str).into())
152        }
153        Err(e) => return Err(format!("Request error when fetching {}: {}", url_str, e).into()),
154    };
155
156    // Read the entire HTTP response body into memory as a Vec<u8>
157    let mut bytes = Vec::new();
158    response
159        .into_body()
160        .into_reader()
161        .read_to_end(&mut bytes)
162        .map_err(|e| format!("Failed reading response body from {}: {}", url_str, e))?;
163
164    let cursor = Cursor::new(bytes);
165
166    let is_gzipped = url_str.ends_with(".gz");
167
168    let reader: Box<dyn std::io::Read> = match is_gzipped {
169        true => Box::new(GzDecoder::new(cursor)),
170        false => Box::new(cursor),
171    };
172
173    Ok(BufReader::new(reader))
174}
175
176/// Get a reader for either a gzipped, non-gzipped file, or stdin
177///
178/// # Arguments
179///
180/// - file_path: path to the file to read, or '-' for stdin
181///
182/// # Returns
183///
184/// A `BufReader` object for a given file path or stdin.
185pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<dyn Read>>> {
186    if file_path_str == "-" {
187        Ok(BufReader::new(Box::new(std::io::stdin()) as Box<dyn Read>))
188    } else {
189        let file_path = Path::new(file_path_str);
190        get_dynamic_reader(file_path)
191    }
192}
193
194///
195/// Create a region-to-id hash-map from a list of regions
196///
197/// # Arguments:
198/// - regions: vec![] of [Region] structs
199pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap<Region, u32> {
200    let mut current_id = 0;
201    let mut region_to_id: HashMap<Region, u32> = HashMap::new();
202    for region in regions.iter() {
203        region_to_id.entry(region.to_owned()).or_insert_with(|| {
204            let old_id = current_id;
205            current_id += 1;
206            old_id
207        });
208    }
209
210    region_to_id
211}
212
213///
214/// Generate an id-to-region hash-map from a list of regions
215///
216/// # Arguments:
217/// - regions: vec![] of [Region] structs
218pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap<u32, Region> {
219    let mut current_id = 0;
220    let mut id_to_region: HashMap<u32, Region> = HashMap::new();
221
222    for region in regions.iter() {
223        id_to_region.entry(current_id).or_insert_with(|| {
224            current_id += 1;
225            region.clone()
226        });
227    }
228
229    id_to_region
230}
231
232///
233/// Create a region-to-id hash-map from a list of region strings
234///
235/// # Arguments:
236/// - regions: vec![] of region strings in the form `chr:start-end`
237pub fn generate_region_string_to_id_map(regions: &[String]) -> HashMap<String, u32> {
238    let mut current_id = 0;
239    let mut region_to_id: HashMap<String, u32> = HashMap::new();
240    for region in regions.iter() {
241        region_to_id.entry(region.to_owned()).or_insert_with(|| {
242            let old_id = current_id;
243            current_id += 1;
244            old_id
245        });
246    }
247
248    region_to_id
249}
250
251///
252/// Generate an id-to-region string hash-map from a list of region strings
253///
254/// # Arguments:
255/// - regions: vec![] of region strings in the form `chr:start-end`
256pub fn generate_id_to_region_string_map(regions: &[String]) -> HashMap<u32, String> {
257    let mut current_id = 0;
258    let mut id_to_region: HashMap<u32, String> = HashMap::new();
259
260    for region in regions.iter() {
261        id_to_region.entry(current_id).or_insert_with(|| {
262            current_id += 1;
263            region.clone()
264        });
265    }
266
267    id_to_region
268}
269
/// Read a chrom-sizes file (whitespace-separated `chrom size` per line)
/// into a map of chromosome name -> size.
///
/// Blank lines and lines with fewer than two fields are skipped (the
/// previous implementation panicked on them with an index-out-of-bounds).
///
/// # Panics
/// Panics if the file cannot be opened or read, or if a size field is not
/// a valid `u32` — the signature is infallible, so errors cannot be
/// returned to the caller.
pub fn get_chrom_sizes<T: AsRef<Path>>(path: T) -> HashMap<String, u32> {
    let chrom_sizes_file = File::open(path.as_ref()).unwrap_or_else(|e| {
        panic!("Failed to open chrom sizes file {:?}: {}", path.as_ref(), e)
    });

    let mut chrom_sizes: HashMap<String, u32> = HashMap::new();

    for line in BufReader::new(chrom_sizes_file).lines() {
        let line = line.expect("Error while reading chrom sizes file");

        // Borrow the fields directly instead of collecting into Vec<String>.
        let mut fields = line.split_whitespace();
        let (chrom, size) = match (fields.next(), fields.next()) {
            (Some(chrom), Some(size)) => (chrom, size),
            // Blank or single-field line: skip instead of panicking.
            _ => continue,
        };

        let size = size.parse::<u32>().unwrap_or_else(|e| {
            panic!("Invalid chrom size '{}' in line '{}': {}", size, line, e)
        });

        chrom_sizes.insert(chrom.to_string(), size);
    }

    chrom_sizes
}
295
296///
297/// Gen
298pub fn generate_ordering_map_for_universe_regions<T: AsRef<Path>>(
299    path: T,
300) -> Result<HashMap<Region, f64>> {
301    let mut map = HashMap::new();
302
303    let reader = get_dynamic_reader(path.as_ref())?;
304
305    for line in reader.lines() {
306        let line = line?;
307        let parts: Vec<&str> = line.split('\t').collect();
308
309        if parts.len() < 5 {
310            anyhow::bail!("BED file line does not have at least 5 fields: {}. It needs to have chr, start, end, name, and score.", line);
311        }
312
313        // parse the fields
314        let chr = parts[0];
315        let start = parts[1].parse::<u32>().with_context(|| {
316            format!("Failed to parse start position in BED file line: {}", line)
317        })?;
318
319        let end = parts[2]
320            .parse::<u32>()
321            .with_context(|| format!("Failed to parse end position in BED file line: {}", line))?;
322
323        let score = parts[4]
324            .parse::<f64>()
325            .with_context(|| format!("Failed to parse score in BED file line: {}", line))?;
326
327        let rest = Some(parts[3..].join("\t")).filter(|s| !s.is_empty());
328
329        let region = Region {
330            chr: chr.to_owned(),
331            start,
332            end,
333            rest,
334        };
335
336        map.insert(region, score);
337    }
338
339    Ok(map)
340}
341
342pub fn read_bedset_file<P: AsRef<Path>>(file_path: P) -> Result<Vec<String>> {
343    let file = File::open(file_path)?;
344    let reader = BufReader::new(file);
345
346    let bed_identifiers = reader
347        .lines()
348        .map(|line| line.map(|s| s.trim().to_string()))
349        .collect::<Result<Vec<_>, _>>()?;
350
351    Ok(bed_identifiers)
352}
353
/// Strip every extension from `path`'s file name and return the bare stem
/// (e.g. `data.bed.gz` -> `data`).
///
/// # Panics
/// Panics if `path` has no file name component.
pub fn remove_all_extensions(path: &Path) -> String {
    // Start from the path with its last extension already removed.
    let mut trimmed = path.with_file_name(
        path.file_stem().unwrap().to_string_lossy().to_string(),
    );

    // Keep peeling extensions until none remain (handles `a.b.c.d`).
    while trimmed.extension().is_some() {
        trimmed = trimmed.with_extension("");
    }

    trimmed.file_stem().unwrap().to_string_lossy().into_owned()
}