seq_here/
utils.rs

//! Utils
//!
//! This module provides helper functions for bioinformatics file handling and sequence analysis.
5
6use std::{fs, io};
7use std::fs::File;
8use std::path::{Path, PathBuf};
9use crate::error::e_exit;
10use bio::io::{fasta, fastq, gff};
11use bio::io::gff::GffType;
12
/// Bioinformatics file formats recognised by this module.
///
/// Values are produced by [`FileType::infer_file_type`], which looks only at
/// a path's extension and never opens the file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileType {
    /// FASTA sequence file (`.fa`, `.fasta`, `.pep`).
    Fasta,
    /// FASTQ sequence file with quality scores (`.fq`, `.fastq`).
    Fastq,
    /// GFF annotation file (`.gff`, `.gff3`).
    Gff,
    /// The extension is missing, is not valid UTF-8, or is not recognised.
    Unknown,
}

impl FileType {
    /// Infers the biological file type from the file extension.
    ///
    /// The comparison is case-insensitive, so `reads.FQ` and `reads.fq` are
    /// treated alike.
    ///
    /// # Arguments
    /// * `path` - Path of the file to classify; the file does not need to exist
    ///
    /// # Returns
    /// * The matching [`FileType`], or [`FileType::Unknown`] when the path has
    ///   no extension, the extension is not valid UTF-8, or it is not one of
    ///   the recognised extensions
    pub fn infer_file_type(path: &PathBuf) -> FileType {
        let Some(ext) = path.extension().and_then(|ext| ext.to_str()) else {
            return FileType::Unknown;
        };

        // Every recognised extension is plain ASCII, so ASCII case folding is sufficient.
        match ext.to_ascii_lowercase().as_str() {
            "fa" | "fasta" | "pep" => FileType::Fasta,
            "gff" | "gff3" => FileType::Gff,
            "fq" | "fastq" => FileType::Fastq,
            _ => FileType::Unknown,
        }
    }
}
42
43
44/// Multiple format file writer based on [bio crate].
45/// Provides a unified interface for writing different bioinformatics file formats.
46pub struct MultiFormatWriter {
47    pub fa: fasta::Writer<File>,  // For writing FASTA format files
48    pub fq: fastq::Writer<File>,  // For writing FASTQ format files
49    pub gff: gff::Writer<File>,   // For writing GFF/GTF format files
50}
51
52impl MultiFormatWriter {
53    /// Creates a new MultiFormatWriter that can write to different biological file formats.
54    ///
55    /// # Arguments
56    /// * `path` - PathBuf indicating where to create the output file
57    ///
58    /// # Returns
59    /// * `io::Result<Self>` - The writer instance or an IO error
60    pub fn new(path: &PathBuf) -> io::Result<Self> {
61        let file = File::create(path)?;
62        Ok(Self {
63            fa: fasta::Writer::new(file.try_clone()?),
64            gff: gff::Writer::new(file.try_clone()?, GffType::GFF3),   // Default to GFF3 format
65            fq: fastq::Writer::new(file),
66        })
67    }
68}
69
/// Determines the file type from the file extension.
///
/// The extension is compared case-insensitively, matching the behaviour of
/// `FileType::infer_file_type`. Recognised extensions and the names returned:
///
/// | Extension              | Returned name |
/// |------------------------|---------------|
/// | `fasta`, `fa`          | `"fasta"`     |
/// | `fastq`, `fq`          | `"fastq"`     |
/// | `gff`, `gff3`, `gtf`   | `"gff"`       |
/// | `bed`                  | `"bed"`       |
/// | `sam`                  | `"sam"`       |
/// | `bam`                  | `"bam"`       |
///
/// # Arguments
/// * `file` - Path to the file to analyze; the file does not need to exist
///
/// # Returns
/// * `Result<String, Box<dyn std::error::Error>>` - Name of the file type, or an error
///
/// # Errors
/// Returns an error (instead of panicking) when the path has no extension,
/// when the extension is not valid UTF-8, or when the extension is not one of
/// the recognised values.
pub fn try_file_type_ext(file: &Path) -> Result<String, Box<dyn std::error::Error>> {
    let ext = file
        .extension()
        .ok_or_else(|| format!("File has no extension: {}", file.display()))?
        .to_str()
        .ok_or_else(|| format!("File extension is not valid UTF-8: {}", file.display()))?;

    // Every recognised extension is plain ASCII, so ASCII case folding is sufficient.
    let kind = match ext.to_ascii_lowercase().as_str() {
        "fasta" | "fa" => "fasta",        // FASTA sequence files
        "fastq" | "fq" => "fastq",        // FASTQ sequence files
        "gff" | "gff3" | "gtf" => "gff",  // Gene annotation files
        "bed" => "bed",                   // Browser Extensible Data format
        "sam" => "sam",                   // Sequence Alignment/Map format
        "bam" => "bam",                   // Binary version of SAM
        // Report the extension exactly as the caller supplied it.
        _ => return Err(format!("Unknown file extension: {:?}", ext).into()),
    };

    Ok(kind.to_string())
}
89
/// Determines the biological sequence type by analyzing its content.
///
/// Every byte is upper-cased and checked against three alphabets:
///
/// * DNA: `A`, `T`, `C`, `G`, `N` (`N` is the placeholder for an unknown nucleotide)
/// * RNA: `A`, `U`, `C`, `G`
/// * Protein: the 20 standard amino-acid codes plus `B`, `J`, `O`, `U`, `X`, `Z`
///
/// A type stays a candidate only while *every* byte seen so far belongs to its
/// alphabet. When several candidates survive the whole sequence, the result is
/// chosen by priority DNA > RNA > Protein.
///
/// The whole sequence is validated: scanning stops early only once no
/// candidate is left, because a later byte can still invalidate the last
/// remaining candidate.
///
/// # Arguments
/// * `seq` - Byte slice containing the sequence to analyze
///
/// # Returns
/// * `String` - `"DNA"`, `"RNA"`, `"Protein"`, or `"Unknown sequence type"`.
///   An empty sequence is reported as `"Unknown sequence type"`.
///
/// # Side effects
/// Writes a diagnostic line to stderr for an empty sequence and for the first
/// byte that rules out every candidate type.
pub fn try_seq_type_seq(seq: &[u8]) -> String {
    const UNKNOWN: &str = "Unknown sequence type";

    if seq.is_empty() {
        eprintln!("Empty sequence");
        // Nothing to inspect, so no type can be claimed.
        return UNKNOWN.into();
    }

    // Each flag stays true only while all bytes so far fit that alphabet.
    let (mut is_dna, mut is_rna, mut is_protein) = (true, true, true);

    for &c in seq {
        let c_upper = c.to_ascii_uppercase();

        is_dna = is_dna && matches!(c_upper, b'A' | b'T' | b'C' | b'G' | b'N');
        is_rna = is_rna && matches!(c_upper, b'A' | b'U' | b'C' | b'G');
        is_protein = is_protein
            && matches!(
                c_upper,
                // 20 standard amino acids
                b'A' | b'R' | b'N' | b'D' | b'C' | b'E' | b'Q' | b'G' | b'H' | b'I'
                    | b'L' | b'K' | b'M' | b'F' | b'P' | b'S' | b'T' | b'W' | b'Y' | b'V'
                    // ambiguity / non-standard codes
                    | b'B' | b'J' | b'O' | b'U' | b'X' | b'Z'
            );

        // Once no candidate is left the answer cannot change, so report the
        // offending byte once and stop scanning.
        if !(is_dna || is_rna || is_protein) {
            eprintln!("Invalid character: {}", c as char);
            break;
        }
    }

    // Resolve ambiguity by priority: DNA > RNA > Protein.
    if is_dna {
        "DNA".into()
    } else if is_rna {
        "RNA".into()
    } else if is_protein {
        "Protein".into()
    } else {
        UNKNOWN.into()
    }
}

/// Utility function that returns true only if exactly one of the three boolean parameters is true.
///
/// # Arguments
/// * `a`, `b`, `c` - Three boolean values to check
///
/// # Returns
/// * `bool` - True if exactly one parameter is true, false otherwise
// No longer called by `try_seq_type_seq` (stopping as soon as one candidate
// remained skipped validation of the rest of the sequence); retained for any
// other callers in this module.
#[allow(dead_code)]
fn only_one_true(a: bool, b: bool, c: bool) -> bool {
    (a as u8 + b as u8 + c as u8) == 1
}
202
/// Stores `content` in the file at `path`, replacing any previous contents.
///
/// # Arguments
/// * `path` - Destination of the file; anything convertible to a [`Path`]
/// * `content` - Text to store in the file
///
/// # Panics
/// Panics with the message "Unable to write file" if the write fails.
pub fn write_file<P: AsRef<Path>>(path: P, content: &str) {
    let outcome = fs::write(path.as_ref(), content.as_bytes());
    outcome.expect("Unable to write file");
}
211
/// Guesses whether a path is meant to be a directory.
///
/// The filesystem is never consulted; the decision is based purely on the
/// shape of the path. A path is treated as a directory when:
///
/// * it has no extension (e.g. `out`, `.hidden`, `..`),
/// * its extension is empty (e.g. `out.`), or
/// * its textual form ends with `.` (e.g. `results.d/.`).
///
/// Anything else (e.g. `out.txt`) is treated as a file. Paths that are not
/// valid UTF-8 are handled without panicking.
///
/// # Arguments
/// * `path` - PathBuf to analyze
///
/// # Returns
/// * `bool` - True if the path likely represents a directory, false otherwise
pub fn is_directory_path(path: &PathBuf) -> bool {
    match path.extension() {
        None => true,
        // A lossy conversion is enough here: only the final character matters,
        // and a trailing ASCII '.' survives the conversion unchanged.
        Some(ext) => ext.is_empty() || path.to_string_lossy().ends_with('.'),
    }
}
227
228/// Creates an empty file and ensures its parent directories exist
229///
230/// # Arguments
231/// * `path` - Path where the file should be created
232pub fn create_file_with_dir(path: &Path) {
233    // First ensure parent directories exist
234    if let Some(parent) = path.parent() {
235        fs::create_dir_all(parent).unwrap_or_else(|e| {
236            e_exit("DIR", &format!("Unable to create directory: {}", e), 1);
237        });
238    }
239
240    // Then create the file
241    File::create(path).unwrap_or_else(|e| {
242        e_exit("FILE", &format!("Unable to create file: {}", e), 1);
243    });
244}