seq_here/utils.rs
//! Utils
//!
//! This module provides helper functions for bioinformatics file handling and sequence analysis.
//!
5
6use std::{fs, io};
7use std::fs::File;
8use std::path::{Path, PathBuf};
9use crate::error::e_exit;
10use bio::io::{fasta, fastq, gff};
11use bio::io::gff::GffType;
12
/// Supported bioinformatics file types.
///
/// Used for extension-based file type detection and handling.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileType {
    /// FASTA sequence files (`.fa`, `.fasta`, `.pep`).
    Fasta,
    /// FASTQ sequence files (`.fq`, `.fastq`).
    Fastq,
    /// GFF annotation files (`.gff`, `.gff3`).
    Gff,
    /// Missing, non-UTF-8, or unrecognized extension.
    Unknown,
}

impl FileType {
    /// Infers the file type from the extension of `path`.
    ///
    /// The comparison is case-insensitive; the file contents are never read.
    ///
    /// # Arguments
    /// * `path` - Path of the file whose extension is inspected
    ///
    /// # Returns
    /// * The matching [`FileType`], or [`FileType::Unknown`] when the path has no
    ///   extension, the extension is not valid UTF-8, or it is not recognized
    pub fn infer_file_type(path: &PathBuf) -> FileType {
        let Some(ext) = path.extension().and_then(|ext| ext.to_str()) else {
            return FileType::Unknown;
        };

        // All recognized extensions are ASCII, so ASCII case folding is sufficient.
        match ext.to_ascii_lowercase().as_str() {
            "fa" | "fasta" | "pep" => FileType::Fasta,
            "gff" | "gff3" => FileType::Gff,
            "fq" | "fastq" => FileType::Fastq,
            _ => FileType::Unknown,
        }
    }
}
42
43
44/// Multiple format file writer based on [bio crate].
45/// Provides a unified interface for writing different bioinformatics file formats.
46pub struct MultiFormatWriter {
47 pub fa: fasta::Writer<File>, // For writing FASTA format files
48 pub fq: fastq::Writer<File>, // For writing FASTQ format files
49 pub gff: gff::Writer<File>, // For writing GFF/GTF format files
50}
51
52impl MultiFormatWriter {
53 /// Creates a new MultiFormatWriter that can write to different biological file formats.
54 ///
55 /// # Arguments
56 /// * `path` - PathBuf indicating where to create the output file
57 ///
58 /// # Returns
59 /// * `io::Result<Self>` - The writer instance or an IO error
60 pub fn new(path: &PathBuf) -> io::Result<Self> {
61 let file = File::create(path)?;
62 Ok(Self {
63 fa: fasta::Writer::new(file.try_clone()?),
64 gff: gff::Writer::new(file.try_clone()?, GffType::GFF3), // Default to GFF3 format
65 fq: fastq::Writer::new(file),
66 })
67 }
68}
69
/// Determines the file type from the file extension.
///
/// The match is case-sensitive and only looks at the extension; the file is
/// never opened.
///
/// # Arguments
/// * `file` - Path to the file to analyze
///
/// # Returns
/// * `Ok(String)` - One of `"fasta"`, `"fastq"`, `"gff"`, `"bed"`, `"sam"`, `"bam"`
///
/// # Errors
/// Returns an error (instead of panicking) when the path has no extension, when
/// the extension is not valid UTF-8, or when the extension is not recognized.
pub fn try_file_type_ext(file: &Path) -> Result<String, Box<dyn std::error::Error>> {
    let ext = file
        .extension()
        .ok_or_else(|| format!("File has no extension: {}", file.display()))?
        .to_str()
        .ok_or_else(|| format!("File extension is not valid UTF-8: {}", file.display()))?;

    let kind = match ext {
        "fasta" | "fa" => "fasta", // FASTA sequence files
        "fastq" | "fq" => "fastq", // FASTQ sequence files
        "gff" | "gtf" => "gff",    // Gene annotation files
        "bed" => "bed",            // Browser Extensible Data format
        "sam" => "sam",            // Sequence Alignment/Map format
        "bam" => "bam",            // Binary version of SAM
        _ => return Err(format!("Unknown file extension: {:?}", ext).into()),
    };

    Ok(kind.to_string())
}
89
/// Determines the biological sequence type by analyzing its content.
///
/// Each byte is upper-cased and checked against the DNA (`ACGTN`), RNA (`ACGU`)
/// and protein (all 26 letter codes listed below) alphabets. A candidate type is
/// dropped as soon as a byte falls outside its alphabet. Scanning stops early
/// once exactly one candidate remains, so bytes after that point are NOT
/// validated (this is a heuristic, not a strict validator). When several
/// candidates survive, the priority is DNA > RNA > Protein.
///
/// # Arguments
/// * `seq` - Byte slice containing the sequence to analyze
///
/// # Returns
/// * `String` - `"DNA"`, `"RNA"`, `"Protein"`, or `"Unknown sequence type"`.
///   An empty sequence is reported on stderr and yields `"Unknown sequence type"`;
///   the first byte that rules out every type is also reported on stderr.
pub fn try_seq_type_seq(seq: &[u8]) -> String {
    const UNKNOWN: &str = "Unknown sequence type";

    if seq.is_empty() {
        eprintln!("Empty sequence");
        // Nothing to classify; do not fall through to the "DNA" default.
        return UNKNOWN.into();
    }

    // A flag stays true while every byte seen so far fits that alphabet.
    let (mut is_dna, mut is_rna, mut is_protein) = (true, true, true);

    for &c in seq {
        let c_upper = c.to_ascii_uppercase();

        // N is commonly used as a placeholder for unknown nucleotides.
        is_dna = is_dna && matches!(c_upper, b'A' | b'T' | b'C' | b'G' | b'N');
        is_rna = is_rna && matches!(c_upper, b'A' | b'U' | b'C' | b'G');
        // Standard amino acid codes plus the B, J, O, U, X, Z codes.
        is_protein = is_protein
            && matches!(
                c_upper,
                b'A' | b'R' | b'N' | b'D' | b'C' | b'E' | b'Q' | b'G' | b'H' | b'I'
                    | b'L' | b'K' | b'M' | b'F' | b'P' | b'S' | b'T' | b'W' | b'Y' | b'V'
                    | b'B' | b'J' | b'O' | b'U' | b'X' | b'Z'
            );

        // This byte eliminated the last remaining candidate: the result is fixed,
        // so report the offending byte once and stop instead of flagging every
        // following byte as invalid.
        if !(is_dna || is_rna || is_protein) {
            eprintln!("Invalid character: {}", c as char);
            break;
        }

        // Heuristic shortcut: once a single candidate is left, accept it.
        if only_one_true(is_dna, is_rna, is_protein) {
            break;
        }
    }

    // Resolve ambiguity by priority: DNA > RNA > Protein.
    if is_dna {
        "DNA".into()
    } else if is_rna {
        "RNA".into()
    } else if is_protein {
        "Protein".into()
    } else {
        UNKNOWN.into()
    }
}

/// Returns true only if exactly one of the three boolean parameters is true.
///
/// # Arguments
/// * `a`, `b`, `c` - Three boolean values to check
///
/// # Returns
/// * `bool` - True if exactly one parameter is true, false otherwise
fn only_one_true(a: bool, b: bool, c: bool) -> bool {
    (a as u8 + b as u8 + c as u8) == 1
}
202
/// Writes `content` to the file at `path`, replacing any existing contents.
///
/// # Arguments
/// * `path` - Destination of the file to write
/// * `content` - Text to store in the file
///
/// # Panics
/// Panics with "Unable to write file" if the underlying `fs::write` call fails.
pub fn write_file<P: AsRef<Path>>(path: P, content: &str) {
    if let Err(err) = fs::write(path, content) {
        panic!("Unable to write file: {:?}", err);
    }
}
211
/// Guesses whether a path is meant to be a directory.
///
/// This is a purely lexical heuristic; the filesystem is never consulted.
/// A path is treated as a directory when its final component has no extension
/// (e.g. `out`, `out/`, `..`), when the extension is empty (e.g. `name.`), or
/// when the path text ends with `.` (e.g. `out.d/.`).
///
/// Non-UTF-8 paths are handled without panicking.
///
/// # Arguments
/// * `path` - PathBuf to analyze
///
/// # Returns
/// * `bool` - True if the path likely represents a directory, false otherwise
pub fn is_directory_path(path: &PathBuf) -> bool {
    match path.extension() {
        None => true,
        // Lossy conversion only replaces invalid sequences, so a trailing ASCII
        // '.' is always preserved and detected.
        Some(ext) => ext.is_empty() || path.to_string_lossy().ends_with('.'),
    }
}
227
228/// Creates an empty file and ensures its parent directories exist
229///
230/// # Arguments
231/// * `path` - Path where the file should be created
232pub fn create_file_with_dir(path: &Path) {
233 // First ensure parent directories exist
234 if let Some(parent) = path.parent() {
235 fs::create_dir_all(parent).unwrap_or_else(|e| {
236 e_exit("DIR", &format!("Unable to create directory: {}", e), 1);
237 });
238 }
239
240 // Then create the file
241 File::create(path).unwrap_or_else(|e| {
242 e_exit("FILE", &format!("Unable to create file: {}", e), 1);
243 });
244}