seq_here/utils.rs
1// use std::path::{Path, PathBuf};
2// use super::error::Result;
3//
4// pub fn validate_files(paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
5// if paths.is_empty() {
6// return Err("No input files provided.".into());
7// }
8//
9// let mut files = Vec::new();
10// for f in paths {
11//
12// let file = Path::new(f.as_);
13// if file.is_dir() {
14//
15// for e in file.read_dir()? {
16// let e = e?;
17// let path = e.path();
18// if path.is_file() {
19// files.push(path);
20// }
21// }
22// return Err(format!("Directory provided: {:?}", f).into());
23// }
24// if file.is_file() {
25// files.push(f);
26// }
27// }
28//
29// Ok(files)
30//
31// }
32
33use std::path::Path;
34
/// Maps a file's extension to the name of the format it denotes.
///
/// Recognised extensions (matched case-sensitively):
/// `fasta`/`fa` -> `"fasta"`, `fastq`/`fq` -> `"fastq"`, `gff`/`gtf` -> `"gff"`,
/// `bed` -> `"bed"`, `sam` -> `"sam"`, `bam` -> `"bam"`.
///
/// # Errors
///
/// Returns an error when `file` has no extension, when the extension is not
/// valid UTF-8, or when the extension is not one of the recognised ones.
pub fn try_file_type_ext(file: &Path) -> Result<String, Box<dyn std::error::Error>> {
    // Both lookups can legitimately fail for user-supplied paths (e.g. `README`),
    // so report them as errors instead of panicking.
    let ext = file
        .extension()
        .ok_or_else(|| format!("File has no extension: {:?}", file))?
        .to_str()
        .ok_or_else(|| format!("File extension is not valid UTF-8: {:?}", file))?;

    let kind = match ext {
        "fasta" | "fa" => "fasta",
        "fastq" | "fq" => "fastq",
        "gff" | "gtf" => "gff",
        "bed" => "bed",
        "sam" => "sam",
        "bam" => "bam",
        _ => return Err(format!("Unknown file extension: {:?}", ext).into()),
    };
    Ok(kind.to_string())
}
48
49// Check the sequence type by a fast way: see if some special symbols exist in the sequence
50pub fn try_seq_type_seq(seq: &[u8]) -> String {
51 if seq.is_empty() {
52 eprintln!("Empty sequence");
53 }
54
55 let (mut is_dna, mut is_rna, mut is_protein) = (true, true, true);
56 for &c in seq {
57 let c_upper = c.to_ascii_uppercase();
58 let mut valid_in_any = false;
59
60 // Check DNA validity
61 if is_dna {
62 if matches!(c_upper, b'A' | b'T' | b'C' | b'G') {
63 valid_in_any = true;
64 } else {
65 is_dna = false;
66 }
67 }
68
69 // Check RNA validity
70 if is_rna {
71 if matches!(c_upper, b'A' | b'U' | b'C' | b'G') {
72 valid_in_any = true;
73 } else {
74 is_rna = false;
75 }
76 }
77
78 // Check Protein validity
79 if is_protein {
80 if matches!(
81 c_upper,
82 b'A' | b'R'
83 | b'N'
84 | b'D'
85 | b'C'
86 | b'E'
87 | b'Q'
88 | b'G'
89 | b'H'
90 | b'I'
91 | b'L'
92 | b'K'
93 | b'M'
94 | b'F'
95 | b'P'
96 | b'S'
97 | b'T'
98 | b'W'
99 | b'Y'
100 | b'V'
101 | b'B'
102 | b'J'
103 | b'O'
104 | b'U'
105 | b'X'
106 | b'Z'
107 ) {
108 valid_in_any = true;
109 } else {
110 is_protein = false;
111 }
112 }
113
114 // Early exit if invalid character
115 if !valid_in_any {
116 eprintln!("Invalid character: {}", c as char);
117 }
118 // Early exit if only one type is valid
119 if only_one_true(is_dna, is_rna, is_protein) {
120 break;
121 }
122 }
123
124 // Determine result by priority
125 if is_dna {
126 "DNA".into()
127 } else if is_rna {
128 "RNA".into()
129 } else if is_protein {
130 "Protein".into()
131 } else {
132 "Unknown sequence type".into()
133 }
134}
135
/// Reports whether exactly one of the three flags is set.
fn only_one_true(a: bool, b: bool, c: bool) -> bool {
    [a, b, c].iter().filter(|&&flag| flag).count() == 1
}