seq_here/
utils.rs

//! Utils
//!
//! This module provides a collection of general-purpose helper functions.
5
6use std::{fs, io};
7use std::fs::File;
8use std::path::{Path, PathBuf};
9use crate::error::e_exit;
10use bio::io::{fasta, fastq, gff};
11use bio::io::gff::GffType;
12
/// File formats this module knows how to recognize.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileType {
    /// FASTA sequence file (`.fa`, `.fasta`, `.pep`).
    Fasta,
    /// FASTQ read file (`.fq`, `.fastq`).
    Fastq,
    /// GFF annotation file (`.gff`, `.gff3`).
    Gff,
    /// Missing, non-UTF-8 or unrecognized extension.
    Unknown,
}

impl FileType {
    /// Infer the file type from the extension of `path`.
    ///
    /// Only the extension is inspected and the comparison ignores ASCII case,
    /// so `reads.FQ` and `reads.fq` are both [`FileType::Fastq`]. A path
    /// without an extension, with a non-UTF-8 extension, or with an extension
    /// that is not listed below yields [`FileType::Unknown`].
    pub fn infer_file_type(path: &PathBuf) -> FileType {
        match path.extension().and_then(|ext| ext.to_str()) {
            // All recognized extensions are ASCII, so ASCII lowercasing is sufficient.
            Some(ext) => match ext.to_ascii_lowercase().as_str() {
                // NOTE: PEP needs investigation
                "fa" | "fasta" | "pep" => FileType::Fasta,
                "gff" | "gff3" => FileType::Gff,
                "fq" | "fastq" => FileType::Fastq,
                _ => FileType::Unknown,
            },
            None => FileType::Unknown,
        }
    }
}
35
36
37/// Multiple format file writer based on [bio crate] .
38pub struct MultiFormatWriter {
39    pub fa: fasta::Writer<File>,
40    pub fq: fastq::Writer<File>,
41    pub gff: gff::Writer<File>,
42}
43
44impl MultiFormatWriter {
45    pub fn new(path: &PathBuf) -> io::Result<Self> {
46        let file = File::create(path)?;
47        Ok(Self {
48            fa: fasta::Writer::new(file.try_clone()?),
49            gff: gff::Writer::new(file.try_clone()?, GffType::GFF3),   // TODO: GFF Type
50            fq: fastq::Writer::new(file),
51        })
52    }
53}
54
/// Map the extension of `file` to a canonical format name.
///
/// Matching ignores ASCII case. The recognized extensions are:
///
/// | extension            | returned name |
/// |----------------------|---------------|
/// | `fasta`, `fa`        | `"fasta"`     |
/// | `fastq`, `fq`        | `"fastq"`     |
/// | `gff`, `gff3`, `gtf` | `"gff"`       |
/// | `bed`                | `"bed"`       |
/// | `sam`                | `"sam"`       |
/// | `bam`                | `"bam"`       |
///
/// # Errors
///
/// Returns an error (instead of panicking) when `file` has no extension,
/// when the extension is not valid UTF-8, or when the extension is not in
/// the table above.
pub fn try_file_type_ext(file: &Path) -> Result<String, Box<dyn std::error::Error>> {
    let ext = file
        .extension()
        .ok_or_else(|| format!("File has no extension: {}", file.display()))?
        .to_str()
        .ok_or_else(|| format!("File extension is not valid UTF-8: {}", file.display()))?;

    let kind = match ext.to_ascii_lowercase().as_str() {
        "fasta" | "fa" => "fasta",
        "fastq" | "fq" => "fastq",
        "gff" | "gff3" | "gtf" => "gff",
        "bed" => "bed",
        "sam" => "sam",
        "bam" => "bam",
        // Report the extension exactly as the caller spelled it.
        _ => return Err(format!("Unknown file extension: {:?}", ext).into()),
    };
    Ok(kind.to_string())
}
69
/// Quickly guess whether `seq` is DNA, RNA or protein from the symbols it contains.
///
/// Every byte is ASCII-upper-cased and tested against three alphabets:
///
/// * DNA: `A`, `C`, `G`, `T`, plus `N` as a placeholder for unknown bases;
/// * RNA: `A`, `C`, `G`, `U`;
/// * Protein: any ASCII letter `A`–`Z` (residue codes including the
///   ambiguity/rare codes `B`, `J`, `O`, `U`, `X`, `Z`).
///
/// A candidate type is dropped at the first byte outside its alphabet. The
/// scan stops as soon as at most one candidate is left, so the remainder of
/// the sequence is *not* validated — this is a fast heuristic, not a validator.
///
/// Returns one of `"DNA"`, `"RNA"`, `"Protein"` or `"Unknown sequence type"`.
/// When several candidates survive, DNA wins over RNA, which wins over
/// protein. An empty sequence is reported on stderr and classified as
/// `"Unknown sequence type"`; the first byte that rules out every candidate
/// is also reported on stderr.
///
/// NOTE(review): `N` is accepted for DNA but not for RNA — confirm whether
/// RNA should accept it as well.
pub fn try_seq_type_seq(seq: &[u8]) -> String {
    if seq.is_empty() {
        eprintln!("Empty sequence");
        // Nothing to inspect: do not fall through to the "DNA" default below.
        return "Unknown sequence type".into();
    }

    let (mut is_dna, mut is_rna, mut is_protein) = (true, true, true);
    for &c in seq {
        let c_upper = c.to_ascii_uppercase();

        // A candidate stays alive only while every byte so far fits its alphabet.
        is_dna = is_dna && matches!(c_upper, b'A' | b'T' | b'C' | b'G' | b'N');
        is_rna = is_rna && matches!(c_upper, b'A' | b'U' | b'C' | b'G');
        // The protein alphabet spans all 26 letters.
        is_protein = is_protein && c_upper.is_ascii_uppercase();

        // No candidate left: report the offending byte once and stop, rather
        // than flagging every remaining byte as invalid.
        if !(is_dna || is_rna || is_protein) {
            eprintln!("Invalid character: {}", c as char);
            break;
        }
        // Exactly one candidate left: the answer is already determined.
        if only_one_true(is_dna, is_rna, is_protein) {
            break;
        }
    }

    // Determine result by priority: a sequence valid as both DNA and protein
    // is reported as DNA.
    if is_dna {
        "DNA".into()
    } else if is_rna {
        "RNA".into()
    } else if is_protein {
        "Protein".into()
    } else {
        "Unknown sequence type".into()
    }
}

/// Return `true` when exactly one of the three flags is set.
fn only_one_true(a: bool, b: bool, c: bool) -> bool {
    (a as u8 + b as u8 + c as u8) == 1
}
163
/// Store `content` in the file located at `path`.
///
/// # Panics
///
/// Panics with the message `Unable to write file` (followed by the I/O error)
/// when the write fails.
pub fn write_file<P: AsRef<Path>>(path: P, content: &str) {
    if let Err(err) = fs::write(path, content) {
        panic!("Unable to write file: {:?}", err);
    }
}
169
/// Guess from its spelling whether `path` denotes a directory rather than a file.
///
/// The check is purely lexical; the filesystem is never consulted. A path is
/// considered a directory when
///
/// * it has no extension (e.g. `out`, `results/run1`), or
/// * its extension is empty or the path text ends with a dot
///   (e.g. `out.`, `a.b/.`).
///
/// Anything else — i.e. a path with a non-empty extension — is considered a
/// file. Non-UTF-8 paths are handled without panicking.
pub fn is_directory_path(path: &PathBuf) -> bool {
    match path.extension() {
        None => true,
        // Lossy conversion only replaces invalid sequences, so a trailing '.'
        // is still detected and non-UTF-8 paths no longer panic.
        Some(ext) => ext.is_empty() || path.to_string_lossy().ends_with('.'),
    }
}
176
177/// Creat empty file.
178///
179pub fn create_file_with_dir(path: &Path) {
180    if let Some(parent) = path.parent() {
181        fs::create_dir_all(parent).unwrap_or_else(|e| {
182            e_exit("DIR", &format!("Unable to create directory: {}", e), 1);
183        });
184    }
185
186    File::create(path).unwrap_or_else(|e| {
187        e_exit("FILE", &format!("Unable to create file: {}", e), 1);
188    });
189}