seq_here/utils.rs
1// use std::path::{Path, PathBuf};
2// use super::error::Result;
3//
4// pub fn validate_files(paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
5// if paths.is_empty() {
6// return Err("No input files provided.".into());
7// }
8//
9// let mut files = Vec::new();
10// for f in paths {
11//
12// let file = Path::new(f.as_);
13// if file.is_dir() {
14//
15// for e in file.read_dir()? {
16// let e = e?;
17// let path = e.path();
18// if path.is_file() {
19// files.push(path);
20// }
21// }
22// return Err(format!("Directory provided: {:?}", f).into());
23// }
24// if file.is_file() {
25// files.push(f);
26// }
27// }
28//
29// Ok(files)
30//
31// }
32
33use std::fs;
34use std::path::Path;
35
/// Infer the file format from the extension of `file`.
///
/// Recognised extensions (matched case-sensitively) and the label returned:
///
/// | extension        | label     |
/// |------------------|-----------|
/// | `fasta`, `fa`    | `"fasta"` |
/// | `fastq`, `fq`    | `"fastq"` |
/// | `gff`, `gtf`     | `"gff"`   |
/// | `bed`            | `"bed"`   |
/// | `sam`            | `"sam"`   |
/// | `bam`            | `"bam"`   |
///
/// # Errors
///
/// Returns an error (instead of panicking) when the path has no extension,
/// when the extension is not valid UTF-8, or when the extension is not one of
/// the recognised values above.
pub fn try_file_type_ext(file: &Path) -> Result<String, Box<dyn std::error::Error>> {
    // Both lookups can legitimately fail on user-supplied paths, so they are
    // turned into errors rather than unwrapped.
    let ext = file
        .extension()
        .ok_or_else(|| format!("File has no extension: {:?}", file))?
        .to_str()
        .ok_or_else(|| format!("File extension is not valid UTF-8: {:?}", file))?;

    let label = match ext {
        "fasta" | "fa" => "fasta",
        "fastq" | "fq" => "fastq",
        "gff" | "gtf" => "gff",
        "bed" => "bed",
        "sam" => "sam",
        "bam" => "bam",
        _ => return Err(format!("Unknown file extension: {:?}", ext).into()),
    };
    Ok(label.to_string())
}
49
/// Guess the sequence type with a fast scan for characters that rule a type out.
///
/// Each byte is upper-cased (ASCII) and tested against three alphabets:
///
/// * DNA: `A T C G N` (`N` is accepted as a placeholder for unknown bases),
/// * RNA: `A U C G`,
/// * Protein: the 20 standard amino-acid letters plus `B J O U X Z`.
///
/// A candidate type is dropped as soon as a byte falls outside its alphabet.
/// Scanning stops early once only one candidate is left (the remainder of the
/// sequence is then *not* inspected) or once no candidate is left.
///
/// # Returns
///
/// One of `"DNA"`, `"RNA"`, `"Protein"` or `"Unknown sequence type"`.
/// When several candidates survive, the priority is DNA, then RNA, then
/// Protein; e.g. `ACG` is reported as DNA.
///
/// An empty `seq` cannot be classified: a warning is written to stderr and
/// `"Unknown sequence type"` is returned.
pub fn try_seq_type_seq(seq: &[u8]) -> String {
    if seq.is_empty() {
        eprintln!("Empty sequence");
        // Without this return every flag would stay `true` and the empty
        // input would be reported as DNA.
        return "Unknown sequence type".into();
    }

    let (mut is_dna, mut is_rna, mut is_protein) = (true, true, true);
    for &c in seq {
        let c_upper = c.to_ascii_uppercase();

        // A candidate stays alive only while every byte so far fits its alphabet.
        is_dna = is_dna && is_dna_base(c_upper);
        is_rna = is_rna && is_rna_base(c_upper);
        is_protein = is_protein && is_amino_acid(c_upper);

        // No candidate accepts this byte: report it once and stop. Continuing
        // would flag every following byte (even valid ones) as invalid.
        if !(is_dna || is_rna || is_protein) {
            eprintln!("Invalid character: {}", c as char);
            break;
        }
        // Early exit if only one type is still possible.
        if only_one_true(is_dna, is_rna, is_protein) {
            break;
        }
    }

    // Determine the result by priority: if both `is_dna` and `is_protein`
    // are still true, the sequence is treated as DNA.
    if is_dna {
        "DNA".into()
    } else if is_rna {
        "RNA".into()
    } else if is_protein {
        "Protein".into()
    } else {
        "Unknown sequence type".into()
    }
}

/// `true` if the upper-cased byte is a DNA base or the unknown-base marker `N`.
fn is_dna_base(c_upper: u8) -> bool {
    matches!(c_upper, b'A' | b'T' | b'C' | b'G' | b'N')
}

/// `true` if the upper-cased byte is an RNA base.
fn is_rna_base(c_upper: u8) -> bool {
    matches!(c_upper, b'A' | b'U' | b'C' | b'G')
}

/// `true` if the upper-cased byte is one of the accepted amino-acid letters.
fn is_amino_acid(c_upper: u8) -> bool {
    matches!(
        c_upper,
        // The 20 standard amino acids.
        b'A' | b'R' | b'N' | b'D' | b'C' | b'E' | b'Q' | b'G' | b'H' | b'I'
            | b'L' | b'K' | b'M' | b'F' | b'P' | b'S' | b'T' | b'W' | b'Y' | b'V'
            // Additional letters accepted by the original check.
            | b'B' | b'J' | b'O' | b'U' | b'X' | b'Z'
    )
}

/// `true` when exactly one of the three flags is set.
fn only_one_true(a: bool, b: bool, c: bool) -> bool {
    (a as u8 + b as u8 + c as u8) == 1
}
142
/// Store `content` in the file located at `path`.
///
/// The write is delegated to [`std::fs::write`], which creates the file if it
/// does not exist and replaces its contents otherwise.
///
/// # Panics
///
/// Panics with the message `Unable to write file` (followed by the underlying
/// I/O error) if the file cannot be written.
pub fn write_file<P: AsRef<Path>>(path: P, content: &str) {
    if let Err(io_err) = fs::write(path, content) {
        panic!("Unable to write file: {:?}", io_err);
    }
}
147}