seq_here/
utils.rs

1// use std::path::{Path, PathBuf};
2// use super::error::Result;
3//
4// pub fn validate_files(paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
5//     if paths.is_empty() {
6//         return Err("No input files provided.".into());
7//     }
8//
9//     let mut files = Vec::new();
10//     for f in paths {
11//
12//         let file = Path::new(f.as_);
13//         if file.is_dir() {
14//
15//             for e in file.read_dir()? {
16//                 let e = e?;
17//                 let path = e.path();
18//                 if path.is_file() {
19//                     files.push(path);
20//                 }
21//             }
22//             return Err(format!("Directory provided: {:?}", f).into());
23//         }
24//         if file.is_file() {
25//             files.push(f);
26//         }
27//     }
28//
29//     Ok(files)
30//
31// }
32
33use std::fs;
34use std::path::Path;
35
/// Infer the file format from the extension of `file`.
///
/// The extension is matched case-sensitively and mapped to a canonical label:
///
/// | extension        | label   |
/// |------------------|---------|
/// | `fasta`, `fa`    | `fasta` |
/// | `fastq`, `fq`    | `fastq` |
/// | `gff`, `gtf`     | `gff`   |
/// | `bed`            | `bed`   |
/// | `sam`            | `sam`   |
/// | `bam`            | `bam`   |
///
/// # Errors
///
/// Returns an error when the path has no extension, when the extension is not
/// valid UTF-8, or when the extension is not one of those listed above.
pub fn try_file_type_ext(file: &Path) -> Result<String, Box<dyn std::error::Error>> {
    // Paths such as `README` or `.hidden` have no extension; report that to the
    // caller instead of panicking.
    let ext = file
        .extension()
        .ok_or_else(|| format!("File has no extension: {:?}", file))?
        .to_str()
        .ok_or_else(|| format!("File extension is not valid UTF-8: {:?}", file))?;

    let file_type = match ext {
        "fasta" | "fa" => "fasta",
        "fastq" | "fq" => "fastq",
        "gff" | "gtf" => "gff",
        "bed" => "bed",
        "sam" => "sam",
        "bam" => "bam",
        _ => return Err(format!("Unknown file extension: {:?}", ext).into()),
    };

    Ok(file_type.to_string())
}
49
/// Guess whether `seq` is DNA, RNA or protein with a quick scan of its symbols.
///
/// Every byte is upper-cased and tested against the alphabet of each candidate
/// type that is still possible. The scan stops as soon as at most one candidate
/// remains, so the rest of the sequence is *not* validated; this is a fast
/// heuristic, not a full check.
///
/// Returns one of `"DNA"`, `"RNA"`, `"Protein"` or `"Unknown sequence type"`.
/// When the scanned symbols fit several types, the priority is
/// DNA > RNA > Protein (for example `ACGT` is reported as DNA although every
/// symbol is also a valid protein code).
///
/// An empty `seq` logs a warning to stderr and yields
/// `"Unknown sequence type"`. A byte that fits none of the remaining candidates
/// is reported once on stderr and ends the scan.
pub fn try_seq_type_seq(seq: &[u8]) -> String {
    if seq.is_empty() {
        eprintln!("Empty sequence");
        // Nothing to inspect: do not fall through to the "DNA" default below.
        return "Unknown sequence type".into();
    }

    let (mut is_dna, mut is_rna, mut is_protein) = (true, true, true);
    for &c in seq {
        let c_upper = c.to_ascii_uppercase();

        // Some files may contain 'N' as a placeholder for unknown bases.
        // NOTE(review): 'N' is accepted for DNA but not for RNA; confirm this
        // asymmetry is intended.
        is_dna = is_dna && matches!(c_upper, b'A' | b'T' | b'C' | b'G' | b'N');
        is_rna = is_rna && matches!(c_upper, b'A' | b'U' | b'C' | b'G');
        // The 20 standard residue codes plus B, J, O, U, X and Z together cover
        // every letter of the alphabet.
        is_protein = is_protein && matches!(c_upper, b'A'..=b'Z');

        // No candidate accepts this symbol: report it once and stop, because
        // later symbols can no longer change the verdict.
        if !(is_dna || is_rna || is_protein) {
            eprintln!("Invalid character: {}", c as char);
            break;
        }
        // Early exit if only one type is still possible.
        if only_one_true(is_dna, is_rna, is_protein) {
            break;
        }
    }

    // Determine the result by priority: if both `is_dna` and `is_protein` are
    // still true, the sequence is reported as DNA.
    if is_dna {
        "DNA".into()
    } else if is_rna {
        "RNA".into()
    } else if is_protein {
        "Protein".into()
    } else {
        "Unknown sequence type".into()
    }
}

/// Return `true` when exactly one of the three flags is set.
fn only_one_true(a: bool, b: bool, c: bool) -> bool {
    (a as u8 + b as u8 + c as u8) == 1
}
142
/// Write `content` to the file at `path`.
///
/// Delegates to [`std::fs::write`], which creates the file if it does not
/// exist and replaces its contents if it does.
///
/// # Panics
///
/// Panics with the message `Unable to write file` (followed by the underlying
/// I/O error) if the write fails.
pub fn write_file<P: AsRef<Path>>(path: P, content: &str) {
    if let Err(io_err) = fs::write(path, content) {
        panic!("Unable to write file: {:?}", io_err);
    }
}