seq_here/
utils.rs

1// use std::path::{Path, PathBuf};
2// use super::error::Result;
3//
4// pub fn validate_files(paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
5//     if paths.is_empty() {
6//         return Err("No input files provided.".into());
7//     }
8//
9//     let mut files = Vec::new();
10//     for f in paths {
11//
12//         let file = Path::new(f.as_);
13//         if file.is_dir() {
14//
15//             for e in file.read_dir()? {
16//                 let e = e?;
17//                 let path = e.path();
18//                 if path.is_file() {
19//                     files.push(path);
20//                 }
21//             }
22//             return Err(format!("Directory provided: {:?}", f).into());
23//         }
24//         if file.is_file() {
25//             files.push(f);
26//         }
27//     }
28//
29//     Ok(files)
30//
31// }
32
33use std::path::Path;
34
/// Infers the file format from the extension of `file`.
///
/// The extension is matched case-sensitively against a fixed table:
///
/// | extension       | returned value |
/// |-----------------|----------------|
/// | `fasta`, `fa`   | `"fasta"`      |
/// | `fastq`, `fq`   | `"fastq"`      |
/// | `gff`, `gtf`    | `"gff"`        |
/// | `bed`           | `"bed"`        |
/// | `sam`           | `"sam"`        |
/// | `bam`           | `"bam"`        |
///
/// # Errors
///
/// Returns an error (instead of panicking) when
/// * the path has no extension,
/// * the extension is not valid UTF-8, or
/// * the extension is not one of the entries listed above.
pub fn try_file_type_ext(file: &Path) -> Result<String, Box<dyn std::error::Error>> {
    // Both lookups can legitimately fail for user-supplied paths, so they are
    // reported through the `Result` rather than unwrapped.
    let ext = file
        .extension()
        .ok_or_else(|| format!("File has no extension: {:?}", file))?
        .to_str()
        .ok_or_else(|| format!("File extension is not valid UTF-8: {:?}", file))?;

    let file_type = match ext {
        "fasta" | "fa" => "fasta",
        "fastq" | "fq" => "fastq",
        // GTF files are reported under the same label as GFF.
        "gff" | "gtf" => "gff",
        "bed" => "bed",
        "sam" => "sam",
        "bam" => "bam",
        _ => return Err(format!("Unknown file extension: {:?}", ext).into()),
    };

    Ok(file_type.to_string())
}
48
49
/// Quickly guesses whether `seq` is DNA, RNA or protein by checking which
/// alphabets can spell every byte of the sequence.
///
/// Bytes are compared case-insensitively (ASCII). Three candidate alphabets
/// are tracked while scanning:
///
/// * DNA: `A C G T`
/// * RNA: `A C G U`
/// * Protein: the letters listed in the protein pattern below
///
/// When more than one alphabet fits the whole sequence, the result is chosen
/// in the order DNA, then RNA, then Protein.
///
/// # Errors
///
/// * `"Empty sequence"` when `seq` is empty.
/// * `"Invalid character: <c>"` at the first byte that no still-viable
///   alphabet accepts (the byte is shown as given, not upper-cased).
/// * `"Unknown sequence type"` if no alphabet remains viable after the scan.
pub fn try_seq_type_seq(seq: &[u8]) -> Result<String, Box<dyn std::error::Error>> {
    if seq.is_empty() {
        return Err("Empty sequence".into());
    }

    // Each flag stays true only while every byte seen so far fits that alphabet.
    let mut dna_ok = true;
    let mut rna_ok = true;
    let mut protein_ok = true;

    for &byte in seq {
        let residue = byte.to_ascii_uppercase();

        dna_ok = dna_ok && matches!(residue, b'A' | b'C' | b'G' | b'T');
        rna_ok = rna_ok && matches!(residue, b'A' | b'C' | b'G' | b'U');
        // Same letter set as before, listed alphabetically; it covers every
        // ASCII letter A-Z.
        protein_ok = protein_ok
            && matches!(
                residue,
                b'A' | b'B' | b'C' | b'D' | b'E' | b'F' | b'G' | b'H' | b'I'
                    | b'J' | b'K' | b'L' | b'M' | b'N' | b'O' | b'P' | b'Q'
                    | b'R' | b'S' | b'T' | b'U' | b'V' | b'W' | b'X' | b'Y'
                    | b'Z'
            );

        // A byte is acceptable only if at least one alphabet that was still
        // viable before this byte also accepts it.
        if !(dna_ok || rna_ok || protein_ok) {
            return Err(format!("Invalid character: {}", byte as char).into());
        }
    }

    // Pick the result by priority: DNA > RNA > Protein.
    let label = match (dna_ok, rna_ok, protein_ok) {
        (true, _, _) => "DNA",
        (false, true, _) => "RNA",
        (false, false, true) => "Protein",
        (false, false, false) => return Err("Unknown sequence type".into()),
    };

    Ok(label.into())
}