//! Shared utilities for `gtars_core`: file-type detection, gzip-aware
//! readers (local and, with the `http` feature, remote), and region/id
//! mapping helpers.
1use std::collections::HashMap;
2use std::ffi::OsStr;
3use std::fs::File;
4use std::io::prelude::*;
5#[cfg(feature = "http")]
6use std::io::Cursor;
7use std::io::{BufRead, BufReader};
8use std::path::{Path, PathBuf};
9use std::str::FromStr;
10
11use anyhow::{Context, Result};
12#[cfg(feature = "http")]
13use flate2::read::GzDecoder;
14use flate2::read::MultiGzDecoder;
15use std::error::Error;
16#[cfg(feature = "http")]
17use ureq::{get, Error as UreqError};
18
19use crate::models::region::Region;
20
/// Genomic file formats recognized by [get_file_info] / [FromStr].
#[derive(Debug, Clone)]
#[allow(clippy::upper_case_acronyms)]
pub enum FileType {
    BED,        // .bed
    BAM,        // .bam
    NARROWPEAK, // .narrowpeak
    UNKNOWN,    // fallback for any unrecognized extension
}
29
30impl FromStr for FileType {
31    type Err = String;
32
33    fn from_str(s: &str) -> Result<Self, Self::Err> {
34        match s.to_lowercase().as_str() {
35            "bed" => Ok(FileType::BED),
36            "bam" => Ok(FileType::BAM),
37            "narrowpeak" => Ok(FileType::NARROWPEAK),
38            _ => Ok(FileType::UNKNOWN), // Return UNKNOWN for unhandled types
39                                        //_ => Err(format!("Invalid file type: {}", s)),
40        }
41    }
42}
43
/// File metadata derived from a path by [get_file_info].
pub struct FileInfo {
    pub file_type: FileType, // format detected from the (inner) extension
    pub is_gzipped: bool,    // true when the filename ends with ".gz"
}
48
49pub fn get_file_info(path: &Path) -> FileInfo {
50    let mut file_type = FileType::UNKNOWN;
51    let mut is_gzipped = false;
52
53    if let Some(os_str_filename) = path.file_name() {
54        if let Some(filename) = os_str_filename.to_str() {
55            // Check for .gz first
56            if filename.ends_with(".gz") {
57                is_gzipped = true;
58                if let Some(base_filename) = filename.strip_suffix(".gz") {
59                    // Try to get the extension before .gz
60                    if let Some(ext) = PathBuf::from(base_filename)
61                        .extension()
62                        .and_then(|e| e.to_str())
63                    {
64                        file_type = FileType::from_str(ext).unwrap_or(FileType::UNKNOWN);
65                    } else {
66                        // If there's no extension before .gz (e.g., "my_data.gz"),
67                        // you might want to handle this specifically or leave as UNKNOWN.
68                        // For now, we'll try to parse the whole base_filename as a type
69                        file_type = FileType::from_str(base_filename).unwrap_or(FileType::UNKNOWN);
70                    }
71                }
72            } else {
73                // Not gzipped, just get the direct extension
74                if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
75                    file_type = FileType::from_str(ext).unwrap_or(FileType::UNKNOWN);
76                }
77            }
78        }
79    }
80
81    FileInfo {
82        file_type,
83        is_gzipped,
84    }
85}
86
/// Parses one line of a BED-like file into (contig, start, end).
///
/// Columns beyond the third are ignored. Start/end that are missing or
/// fail to parse become `-1` (sentinel kept for backward compatibility).
///
/// # Arguments
/// - line: one tab-separated line of a BED-like file
///
/// # Returns
/// `Some((chrom, start, end))`; `None` only if the line yields no first
/// field at all.
pub fn parse_bedlike_file(line: &str) -> Option<(String, i32, i32)> {
    let mut fields = line.split('\t');
    // First field is the chromosome / contig name.
    let ctg = fields.next()?;
    // Parse 2nd and 3rd fields as integers, falling back to -1 on failure.
    let st = fields
        .next()
        .and_then(|s| s.parse::<i32>().ok())
        .unwrap_or(-1);
    let en = fields
        .next()
        .and_then(|s| s.parse::<i32>().ok())
        .unwrap_or(-1);

    // `ctg.to_string()` replaces the old `ctg.parse().unwrap()`, which was a
    // needless fallible &str -> String round-trip through FromStr.
    Some((ctg.to_string(), st, en))
}
108
109///
110/// Get a reader for either a gzip'd or non-gzip'd file.
111///
112/// # Arguments
113///
114/// - path: path to the file to read
115///
116pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
117    let is_gzipped = path.extension() == Some(OsStr::new("gz"));
118    let file = File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
119    let file: Box<dyn Read> = match is_gzipped {
120        true => Box::new(MultiGzDecoder::new(file)),
121        false => Box::new(file),
122    };
123
124    let reader = BufReader::new(file);
125
126    Ok(reader)
127}
128
129///
130/// Get a reader for url ling. Either for gzip'd or non-gzip'd file
131///
132/// # Arguments
133///
134/// - path: path to the file to read
135///
136#[cfg(feature = "http")]
137pub fn get_dynamic_reader_from_url(
138    url: &Path,
139) -> Result<BufReader<Box<dyn std::io::Read>>, Box<dyn Error>> {
140    let mut url_str = url
141        .to_str()
142        .ok_or_else(|| "URL path is not valid UTF-8")?
143        .to_string();
144
145    let is_ftp = url_str.starts_with("ftp://");
146    if is_ftp {
147        println!("ftp is not fully implemented. Bugs could appear");
148        url_str = url_str.replacen("ftp://", "https://", 1);
149    }
150
151    // Perform request
152    let response = match get(&url_str).call() {
153        Ok(resp) => resp,
154        Err(UreqError::StatusCode(code)) => {
155            return Err(format!("HTTP status {} when fetching {}", code, url_str).into())
156        }
157        Err(e) => return Err(format!("Request error when fetching {}: {}", url_str, e).into()),
158    };
159
160    // Read the entire HTTP response body into memory as a Vec<u8>
161    let mut bytes = Vec::new();
162    response
163        .into_body()
164        .into_reader()
165        .read_to_end(&mut bytes)
166        .map_err(|e| format!("Failed reading response body from {}: {}", url_str, e))?;
167
168    let cursor = Cursor::new(bytes);
169
170    let is_gzipped = url_str.ends_with(".gz");
171
172    let reader: Box<dyn std::io::Read> = match is_gzipped {
173        true => Box::new(GzDecoder::new(cursor)),
174        false => Box::new(cursor),
175    };
176
177    Ok(BufReader::new(reader))
178}
179
180/// Get a reader for either a gzipped, non-gzipped file, or stdin
181///
182/// # Arguments
183///
184/// - file_path: path to the file to read, or '-' for stdin
185///
186/// # Returns
187///
188/// A `BufReader` object for a given file path or stdin.
189pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<dyn Read>>> {
190    if file_path_str == "-" {
191        Ok(BufReader::new(Box::new(std::io::stdin()) as Box<dyn Read>))
192    } else {
193        let file_path = Path::new(file_path_str);
194        get_dynamic_reader(file_path)
195    }
196}
197
198///
199/// Create a region-to-id hash-map from a list of regions
200///
201/// # Arguments:
202/// - regions: vec![] of [Region] structs
203pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap<Region, u32> {
204    let mut current_id = 0;
205    let mut region_to_id: HashMap<Region, u32> = HashMap::new();
206    for region in regions.iter() {
207        region_to_id.entry(region.to_owned()).or_insert_with(|| {
208            let old_id = current_id;
209            current_id += 1;
210            old_id
211        });
212    }
213
214    region_to_id
215}
216
217///
218/// Generate an id-to-region hash-map from a list of regions
219///
220/// # Arguments:
221/// - regions: vec![] of [Region] structs
222pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap<u32, Region> {
223    let mut current_id = 0;
224    let mut id_to_region: HashMap<u32, Region> = HashMap::new();
225
226    for region in regions.iter() {
227        id_to_region.entry(current_id).or_insert_with(|| {
228            current_id += 1;
229            region.clone()
230        });
231    }
232
233    id_to_region
234}
235
236///
237/// Create a region-to-id hash-map from a list of region strings
238///
239/// # Arguments:
240/// - regions: vec![] of region strings in the form `chr:start-end`
241pub fn generate_region_string_to_id_map(regions: &[String]) -> HashMap<String, u32> {
242    let mut current_id = 0;
243    let mut region_to_id: HashMap<String, u32> = HashMap::new();
244    for region in regions.iter() {
245        region_to_id.entry(region.to_owned()).or_insert_with(|| {
246            let old_id = current_id;
247            current_id += 1;
248            old_id
249        });
250    }
251
252    region_to_id
253}
254
255///
256/// Generate an id-to-region string hash-map from a list of region strings
257///
258/// # Arguments:
259/// - regions: vec![] of region strings in the form `chr:start-end`
260pub fn generate_id_to_region_string_map(regions: &[String]) -> HashMap<u32, String> {
261    let mut current_id = 0;
262    let mut id_to_region: HashMap<u32, String> = HashMap::new();
263
264    for region in regions.iter() {
265        id_to_region.entry(current_id).or_insert_with(|| {
266            current_id += 1;
267            region.clone()
268        });
269    }
270
271    id_to_region
272}
273
/// Read a two-column chrom-sizes file (`<chrom><ws><size>` per line) into a map.
///
/// Blank lines are skipped (a trailing newline previously caused an
/// out-of-bounds panic in the Vec-indexing code).
///
/// # Panics
/// Panics if the file cannot be opened, a line cannot be read, a non-blank
/// line lacks a size field, or the size fails to parse as `u32`.
pub fn get_chrom_sizes<T: AsRef<Path>>(path: T) -> HashMap<String, u32> {
    // `.with_context().unwrap()` was equivalent to a plain expect.
    let chrom_sizes_file = File::open(path.as_ref()).expect("Failed to open chrom sizes file.");

    let mut chrom_sizes: HashMap<String, u32> = HashMap::new();

    for line in BufReader::new(chrom_sizes_file).lines() {
        let line = line.expect("Error while reading chrom sizes file");

        // Iterate fields directly instead of collecting into a Vec<String>.
        let mut fields = line.split_whitespace();
        let chrom = match fields.next() {
            Some(c) => c,
            None => continue, // blank line — skip instead of panicking
        };
        let size = fields
            .next()
            .unwrap_or_else(|| panic!("chrom sizes line missing size field: {:?}", line))
            .parse::<u32>()
            .unwrap_or_else(|e| panic!("invalid chrom size in line {:?}: {}", line, e));

        chrom_sizes.insert(chrom.to_string(), size);
    }

    chrom_sizes
}
299
300///
301/// Gen
302pub fn generate_ordering_map_for_universe_regions<T: AsRef<Path>>(
303    path: T,
304) -> Result<HashMap<Region, f64>> {
305    let mut map = HashMap::new();
306
307    let reader = get_dynamic_reader(path.as_ref())?;
308
309    for line in reader.lines() {
310        let line = line?;
311        let parts: Vec<&str> = line.split('\t').collect();
312
313        if parts.len() < 5 {
314            anyhow::bail!("BED file line does not have at least 5 fields: {}. It needs to have chr, start, end, name, and score.", line);
315        }
316
317        // parse the fields
318        let chr = parts[0];
319        let start = parts[1].parse::<u32>().with_context(|| {
320            format!("Failed to parse start position in BED file line: {}", line)
321        })?;
322
323        let end = parts[2]
324            .parse::<u32>()
325            .with_context(|| format!("Failed to parse end position in BED file line: {}", line))?;
326
327        let score = parts[4]
328            .parse::<f64>()
329            .with_context(|| format!("Failed to parse score in BED file line: {}", line))?;
330
331        let rest = Some(parts[3..].join("\t")).filter(|s| !s.is_empty());
332
333        let region = Region {
334            chr: chr.to_owned(),
335            start,
336            end,
337            rest,
338        };
339
340        map.insert(region, score);
341    }
342
343    Ok(map)
344}
345
346pub fn read_bedset_file<P: AsRef<Path>>(file_path: P) -> Result<Vec<String>> {
347    let file = File::open(file_path)?;
348    let reader = BufReader::new(file);
349
350    let bed_identifiers = reader
351        .lines()
352        .map(|line| line.map(|s| s.trim().to_string()))
353        .collect::<Result<Vec<_>, _>>()?;
354
355    Ok(bed_identifiers)
356}
357
/// Strip every extension from the final path component and return the bare
/// stem (e.g. `data/sample.bed.gz` -> `sample`).
///
/// # Panics
/// Panics if the path has no file-name component (e.g. `..` or `/`).
pub fn remove_all_extensions(path: &Path) -> String {
    // Start from the path with its outermost extension already removed.
    let mut current =
        path.with_file_name(path.file_stem().unwrap().to_string_lossy().to_string());

    // Peel one extension per iteration until none remain.
    while current.extension().is_some() {
        current = current.with_extension("");
    }

    current.file_stem().unwrap().to_string_lossy().to_string()
}