1use std::collections::HashMap;
2use std::ffi::OsStr;
3use std::fs::File;
4use std::io::prelude::*;
5#[cfg(feature = "http")]
6use std::io::Cursor;
7use std::io::{BufRead, BufReader};
8use std::path::{Path, PathBuf};
9use std::str::FromStr;
10
11use anyhow::{Context, Result};
12use flate2::read::MultiGzDecoder;
13#[cfg(feature = "http")]
14use std::error::Error;
15#[cfg(feature = "http")]
16use ureq::{get, Error as UreqError};
17
18use crate::models::region::Region;
19
/// File formats this module can recognise from a file name.
///
/// `PartialEq`/`Eq` are derived so callers (and tests) can compare detected
/// types directly instead of pattern matching on every check.
#[derive(Debug, Clone, PartialEq, Eq)]
// The variant names deliberately mirror the upper-case format names.
#[allow(clippy::upper_case_acronyms)]
pub enum FileType {
    /// Selected by the `bed` token.
    BED,
    /// Selected by the `bam` token.
    BAM,
    /// Selected by the `narrowpeak` token.
    NARROWPEAK,
    /// Fallback for every token that is not recognised.
    UNKNOWN,
}
28
29impl FromStr for FileType {
30 type Err = String;
31
32 fn from_str(s: &str) -> Result<Self, Self::Err> {
33 match s.to_lowercase().as_str() {
34 "bed" => Ok(FileType::BED),
35 "bam" => Ok(FileType::BAM),
36 "narrowpeak" => Ok(FileType::NARROWPEAK),
37 _ => Ok(FileType::UNKNOWN), }
40 }
41}
42
43pub struct FileInfo {
44 pub file_type: FileType,
45 pub is_gzipped: bool,
46}
47
48pub fn get_file_info(path: &Path) -> FileInfo {
49 let mut file_type = FileType::UNKNOWN;
50 let mut is_gzipped = false;
51
52 if let Some(os_str_filename) = path.file_name() {
53 if let Some(filename) = os_str_filename.to_str() {
54 if filename.ends_with(".gz") {
56 is_gzipped = true;
57 if let Some(base_filename) = filename.strip_suffix(".gz") {
58 if let Some(ext) = PathBuf::from(base_filename)
60 .extension()
61 .and_then(|e| e.to_str())
62 {
63 file_type = FileType::from_str(ext).unwrap_or(FileType::UNKNOWN);
64 } else {
65 file_type = FileType::from_str(base_filename).unwrap_or(FileType::UNKNOWN);
69 }
70 }
71 } else {
72 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
74 file_type = FileType::from_str(ext).unwrap_or(FileType::UNKNOWN);
75 }
76 }
77 }
78 }
79
80 FileInfo {
81 file_type,
82 is_gzipped,
83 }
84}
85
/// Splits one tab-separated line into `(contig, start, end)`.
///
/// The first field is returned verbatim as the contig name. The second and
/// third fields are parsed as `i32`; a missing or unparsable coordinate is
/// reported as `-1`. Any further fields are ignored.
pub fn parse_bedlike_file(line: &str) -> Option<(String, i32, i32)> {
    /// Parses an optional coordinate field, falling back to the `-1` sentinel.
    fn coordinate(field: Option<&str>) -> i32 {
        match field.map(str::parse::<i32>) {
            Some(Ok(value)) => value,
            _ => -1,
        }
    }

    let mut columns = line.split('\t');
    let contig = columns.next()?.to_owned();
    let start = coordinate(columns.next());
    let end = coordinate(columns.next());

    Some((contig, start, end))
}
107
108pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
116 let is_gzipped = path.extension() == Some(OsStr::new("gz"));
117 let file = File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
118 let file: Box<dyn Read> = match is_gzipped {
119 true => Box::new(MultiGzDecoder::new(file)),
120 false => Box::new(file),
121 };
122
123 let reader = BufReader::new(file);
124
125 Ok(reader)
126}
127
/// Downloads the resource at `url` and returns a buffered reader over its
/// contents, decompressing when the URL ends with `.gz`.
///
/// The whole response body is read into memory before the reader is built.
/// `ftp://` URLs are rewritten to `https://` and a warning is emitted on
/// stderr, so that data written to stdout by the caller is not polluted.
///
/// # Errors
///
/// Returns an error when `url` is not valid UTF-8, when the request fails or
/// answers with an error status, or when the body cannot be read.
#[cfg(feature = "http")]
pub fn get_dynamic_reader_from_url(
    url: &Path,
) -> Result<BufReader<Box<dyn std::io::Read>>, Box<dyn Error>> {
    let mut url_str = url
        .to_str()
        .ok_or("URL path is not valid UTF-8")?
        .to_string();

    if url_str.starts_with("ftp://") {
        // Diagnostics belong on stderr; stdout may be carrying pipeline output.
        eprintln!("ftp is not fully implemented. Bugs could appear");
        url_str = url_str.replacen("ftp://", "https://", 1);
    }

    let response = match get(&url_str).call() {
        Ok(resp) => resp,
        Err(UreqError::StatusCode(code)) => {
            return Err(format!("HTTP status {} when fetching {}", code, url_str).into())
        }
        Err(e) => return Err(format!("Request error when fetching {}: {}", url_str, e).into()),
    };

    // Buffer the full body so the reader does not depend on the connection.
    let mut bytes = Vec::new();
    response
        .into_body()
        .into_reader()
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed reading response body from {}: {}", url_str, e))?;

    let cursor = Cursor::new(bytes);

    let reader: Box<dyn std::io::Read> = if url_str.ends_with(".gz") {
        Box::new(MultiGzDecoder::new(cursor))
    } else {
        Box::new(cursor)
    };

    Ok(BufReader::new(reader))
}
178
179pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<dyn Read>>> {
189 if file_path_str == "-" {
190 Ok(BufReader::new(Box::new(std::io::stdin()) as Box<dyn Read>))
191 } else {
192 let file_path = Path::new(file_path_str);
193 get_dynamic_reader(file_path)
194 }
195}
196
197pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap<Region, u32> {
203 let mut current_id = 0;
204 let mut region_to_id: HashMap<Region, u32> = HashMap::new();
205 for region in regions.iter() {
206 region_to_id.entry(region.to_owned()).or_insert_with(|| {
207 let old_id = current_id;
208 current_id += 1;
209 old_id
210 });
211 }
212
213 region_to_id
214}
215
216pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap<u32, Region> {
222 let mut current_id = 0;
223 let mut id_to_region: HashMap<u32, Region> = HashMap::new();
224
225 for region in regions.iter() {
226 id_to_region.entry(current_id).or_insert_with(|| {
227 current_id += 1;
228 region.clone()
229 });
230 }
231
232 id_to_region
233}
234
/// Builds a lookup from each distinct region string to a numeric id.
///
/// Ids start at 0 and are handed out in order of first appearance; repeated
/// strings keep the id assigned to their first occurrence.
pub fn generate_region_string_to_id_map(regions: &[String]) -> HashMap<String, u32> {
    let mut ids: HashMap<String, u32> = HashMap::new();
    let mut next_id: u32 = 0;

    for name in regions {
        // Only a string seen for the first time consumes an id.
        if let std::collections::hash_map::Entry::Vacant(slot) = ids.entry(name.clone()) {
            slot.insert(next_id);
            next_id += 1;
        }
    }

    ids
}
253
/// Builds a lookup from numeric id to region string, where the id of a string
/// is its position in `regions` (starting at 0).
///
/// Unlike [`generate_region_string_to_id_map`], repeated strings are not
/// collapsed: every slice element receives its own id.
pub fn generate_id_to_region_string_map(regions: &[String]) -> HashMap<u32, String> {
    regions
        .iter()
        .zip(0u32..)
        .map(|(name, id)| (id, name.clone()))
        .collect()
}
272
/// Reads a whitespace-separated chromosome sizes file into a
/// `name -> size` map.
///
/// Each non-blank line must start with a chromosome name followed by its size
/// as an unsigned integer; additional columns are ignored. Blank lines (for
/// example a trailing empty line) are skipped. When a name occurs more than
/// once, the last occurrence wins.
///
/// # Panics
///
/// Panics when the file cannot be opened or read, or when a non-blank line
/// has no size column or a size that does not parse as `u32`.
pub fn get_chrom_sizes<T: AsRef<Path>>(path: T) -> HashMap<String, u32> {
    let path = path.as_ref();
    let chrom_sizes_file = File::open(path).unwrap_or_else(|err| {
        panic!(
            "Failed to open chrom sizes file {}: {}",
            path.display(),
            err
        )
    });

    let mut chrom_sizes: HashMap<String, u32> = HashMap::new();

    for (index, line) in BufReader::new(chrom_sizes_file).lines().enumerate() {
        let line = line
            .unwrap_or_else(|err| panic!("Error while reading chrom sizes file: {}", err));

        let mut fields = line.split_whitespace();
        let chrom = match fields.next() {
            Some(chrom) => chrom,
            // Blank or whitespace-only line: nothing to record.
            None => continue,
        };

        let size = fields
            .next()
            .and_then(|raw| raw.parse::<u32>().ok())
            .unwrap_or_else(|| {
                panic!(
                    "Malformed chrom sizes entry on line {} of {}: {:?}",
                    index + 1,
                    path.display(),
                    line
                )
            });

        chrom_sizes.insert(chrom.to_owned(), size);
    }

    chrom_sizes
}
298
299pub fn generate_ordering_map_for_universe_regions<T: AsRef<Path>>(
302 path: T,
303) -> Result<HashMap<Region, f64>> {
304 let mut map = HashMap::new();
305
306 let reader = get_dynamic_reader(path.as_ref())?;
307
308 for line in reader.lines() {
309 let line = line?;
310 let parts: Vec<&str> = line.split('\t').collect();
311
312 if parts.len() < 5 {
313 anyhow::bail!("BED file line does not have at least 5 fields: {}. It needs to have chr, start, end, name, and score.", line);
314 }
315
316 let chr = parts[0];
318 let start = parts[1].parse::<u32>().with_context(|| {
319 format!("Failed to parse start position in BED file line: {}", line)
320 })?;
321
322 let end = parts[2]
323 .parse::<u32>()
324 .with_context(|| format!("Failed to parse end position in BED file line: {}", line))?;
325
326 let score = parts[4]
327 .parse::<f64>()
328 .with_context(|| format!("Failed to parse score in BED file line: {}", line))?;
329
330 let rest = Some(parts[3..].join("\t")).filter(|s| !s.is_empty());
331
332 let region = Region {
333 chr: chr.to_owned(),
334 start,
335 end,
336 rest,
337 };
338
339 map.insert(region, score);
340 }
341
342 Ok(map)
343}
344
345pub fn read_bedset_file<P: AsRef<Path>>(file_path: P) -> Result<Vec<String>> {
346 let file = File::open(file_path)?;
347 let reader = BufReader::new(file);
348
349 let bed_identifiers = reader
350 .lines()
351 .map(|line| line.map(|s| s.trim().to_string()))
352 .collect::<Result<Vec<_>, _>>()?;
353
354 Ok(bed_identifiers)
355}
356
/// Produces a sort key that orders chromosome names karyotypically.
///
/// An optional `chr` prefix is ignored. Keys compare as tuples, giving the
/// order: numeric chromosomes (by number), `X`, `Y`, `M`/`MT`, then every
/// other name in lexicographic order of the un-prefixed label.
pub fn chrom_karyotype_key(chr: &str) -> (u8, u32, String) {
    let label = match chr.strip_prefix("chr") {
        Some(stripped) => stripped,
        None => chr,
    };

    // Numeric names sort first, ordered by their value.
    if let Ok(number) = label.parse::<u32>() {
        return (0, number, String::new());
    }

    let group = match label {
        "X" => 1,
        "Y" => 2,
        "M" | "MT" => 3,
        // Anything else goes last, ordered by its label.
        _ => return (4, 0, label.to_string()),
    };

    (group, 0, String::new())
}
371
/// Returns the file name of `path` with every extension removed, e.g.
/// `dir/sample.bed.gz` becomes `sample`.
///
/// # Panics
///
/// Panics when `path` has no file stem (for example an empty path or `..`).
pub fn remove_all_extensions(path: &Path) -> String {
    /// File stem of `p` as an owned, lossily decoded string.
    fn lossy_stem(p: &Path) -> String {
        p.file_stem().unwrap().to_string_lossy().to_string()
    }

    let mut name = lossy_stem(path);
    let mut trimmed = path.with_file_name(name.clone());

    // Peel one extension per iteration until none is left.
    while trimmed.extension().is_some() {
        trimmed = trimmed.with_extension("");
        name = lossy_stem(&trimmed);
    }

    name
}
388
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `labels` sorted by their karyotype key.
    fn karyotype_sorted(mut labels: Vec<&str>) -> Vec<&str> {
        labels.sort_by_key(|label| chrom_karyotype_key(label));
        labels
    }

    /// Prefixed names: numeric first, then X, Y, M, then everything else.
    #[test]
    fn test_chrom_karyotype_sort_order() {
        let sorted = karyotype_sorted(vec![
            "chrM", "chrX", "chr2", "chr10", "chr1", "chrY", "chrUn_gl",
        ]);
        assert_eq!(
            sorted,
            vec!["chr1", "chr2", "chr10", "chrX", "chrY", "chrM", "chrUn_gl"]
        );
    }

    /// The same ordering applies when the `chr` prefix is absent.
    #[test]
    fn test_chrom_karyotype_without_prefix() {
        let sorted = karyotype_sorted(vec!["MT", "X", "2", "1", "Y"]);
        assert_eq!(sorted, vec!["1", "2", "X", "Y", "MT"]);
    }
}