seq_here/
utils.rs

1// use std::path::{Path, PathBuf};
2// use super::error::Result;
3//
4// pub fn validate_files(paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
5//     if paths.is_empty() {
6//         return Err("No input files provided.".into());
7//     }
8//
9//     let mut files = Vec::new();
10//     for f in paths {
11//
12//         let file = Path::new(f.as_);
13//         if file.is_dir() {
14//
15//             for e in file.read_dir()? {
16//                 let e = e?;
17//                 let path = e.path();
18//                 if path.is_file() {
19//                     files.push(path);
20//                 }
21//             }
22//             return Err(format!("Directory provided: {:?}", f).into());
23//         }
24//         if file.is_file() {
25//             files.push(f);
26//         }
27//     }
28//
29//     Ok(files)
30//
31// }
32
33use std::path::Path;
34
/// Get the sequence file type from the file extension.
///
/// Recognised extensions are mapped to a canonical format name:
/// `fasta`/`fa` -> `"fasta"`, `fastq`/`fq` -> `"fastq"`, `gff`/`gtf` -> `"gff"`,
/// while `bed`, `sam` and `bam` map to themselves. Matching is case-sensitive.
///
/// # Errors
///
/// Returns an error when `file` has no extension, when the extension is not
/// valid UTF-8, or when the extension is not one of the recognised ones.
pub fn try_file_type_ext(file: &Path) -> Result<String, Box<dyn std::error::Error>> {
    // A missing or non-UTF-8 extension is ordinary user input, so it is
    // reported through the `Result` instead of panicking.
    let ext = file
        .extension()
        .ok_or_else(|| format!("File has no extension: {:?}", file))?
        .to_str()
        .ok_or_else(|| format!("File extension is not valid UTF-8: {:?}", file))?;

    let file_type = match ext {
        "fasta" | "fa" => "fasta",
        "fastq" | "fq" => "fastq",
        "gff" | "gtf" => "gff",
        "bed" => "bed",
        "sam" => "sam",
        "bam" => "bam",
        _ => return Err(format!("Unknown file extension: {:?}", ext).into()),
    };
    Ok(file_type.to_string())
}
48
49// Check the sequence type by a fast way: see if some special symbols exist in the sequence
50pub fn try_seq_type_seq(seq: &[u8]) -> String {
51    if seq.is_empty() {
52        eprintln!("Empty sequence");
53    }
54
55    let (mut is_dna, mut is_rna, mut is_protein) = (true, true, true);
56    for &c in seq {
57        let c_upper = c.to_ascii_uppercase();
58        let mut valid_in_any = false;
59
60        // Check DNA validity
61        if is_dna {
62            if matches!(c_upper, b'A' | b'T' | b'C' | b'G') {
63                valid_in_any = true;
64            } else {
65                is_dna = false;
66            }
67        }
68
69        // Check RNA validity
70        if is_rna {
71            if matches!(c_upper, b'A' | b'U' | b'C' | b'G') {
72                valid_in_any = true;
73            } else {
74                is_rna = false;
75            }
76        }
77
78        // Check Protein validity
79        if is_protein {
80            if matches!(
81                c_upper,
82                b'A' | b'R'
83                    | b'N'
84                    | b'D'
85                    | b'C'
86                    | b'E'
87                    | b'Q'
88                    | b'G'
89                    | b'H'
90                    | b'I'
91                    | b'L'
92                    | b'K'
93                    | b'M'
94                    | b'F'
95                    | b'P'
96                    | b'S'
97                    | b'T'
98                    | b'W'
99                    | b'Y'
100                    | b'V'
101                    | b'B'
102                    | b'J'
103                    | b'O'
104                    | b'U'
105                    | b'X'
106                    | b'Z'
107            ) {
108                valid_in_any = true;
109            } else {
110                is_protein = false;
111            }
112        }
113
114        // Early exit if invalid character
115        if !valid_in_any {
116            eprintln!("Invalid character: {}", c as char);
117        }
118        // Early exit if only one type is valid
119        if only_one_true(is_dna, is_rna, is_protein) {
120            break;
121        }
122    }
123
124    // Determine result by priority
125    if is_dna {
126        "DNA".into()
127    } else if is_rna {
128        "RNA".into()
129    } else if is_protein {
130        "Protein".into()
131    } else {
132        "Unknown sequence type".into()
133    }
134}
135
/// Returns `true` when exactly one of the three flags is set.
fn only_one_true(a: bool, b: bool, c: bool) -> bool {
    let set_flags = [a, b, c].iter().filter(|&&flag| flag).count();
    set_flags == 1
}