codonrs/
lib.rs

1
2pub mod analysis {
3    use serde::{Deserialize, Serialize};
4    use std::collections::{HashMap, HashSet};
5    use std::error::Error;
6    use std::fs::File;
7    use std::io::{self, BufRead, BufReader, Write};
8
9    static CODE_FILE: &str = include_str!("genetic_code.json");
10
11    /// Load the provided genetic_code.json data file
12    fn load_embedded_genetic_codes() -> Result<Vec<GeneticCodeJSON>, Box<dyn Error>> {
13        let genetic_codes: Vec<GeneticCodeJSON> = serde_json::from_str(CODE_FILE)?;
14        Ok(genetic_codes)
15    }
16
17    #[derive(Debug, Clone)]
18    pub struct GeneticCode {
19        id: String,
20        name: String,
21        codon_map: HashMap<String, String>,
22    }
23
24    /// A genetic code object, containing an id, name and translation table (codon_map)
25    impl GeneticCode {
26        pub fn new() -> Self {
27            GeneticCode {
28                id: String::new(),
29                name: String::new(),
30                codon_map: HashMap::new(),
31            }
32        }
33    }
34
35    /// Define the structure for the JSON format
36    #[derive(Debug, Serialize, Deserialize)]
37    struct GeneticCodeJSON {
38        name: Vec<String>,
39        id: u8,
40        ncbieaa: String,
41        sncbieaa: String,
42        base_mappings: HashMap<String, String>,
43    }
44
45    impl GeneticCodeJSON {
46        /// Convert the genetic code into a codon-to-amino-acid mapping
47        fn to_codon_map(&self) -> HashMap<String, String> {
48            let base1 = self
49                .base_mappings
50                .get("Base1")
51                .cloned()
52                .unwrap_or_else(|| String::new());
53            let base2 = self
54                .base_mappings
55                .get("Base2")
56                .cloned()
57                .unwrap_or_else(|| String::new());
58            let base3 = self
59                .base_mappings
60                .get("Base3")
61                .cloned()
62                .unwrap_or_else(|| String::new());
63
64            let mut codon_map = HashMap::new();
65
66            for (i, ((b1, b2), b3)) in base1
67                .chars()
68                .zip(base2.chars())
69                .zip(base3.chars())
70                .enumerate()
71            {
72                let codon = format!("{}{}{}", b1, b2, b3);
73                let amino_acid = self.ncbieaa.chars().nth(i).unwrap_or('*').to_string(); // Convert char to String
74                codon_map.insert(codon, amino_acid);
75            }
76
77            codon_map
78        }
79    }
80
81    /// Get a genetic code from a GeneticCode object by a provided ID
82    fn get_genetic_code_by_id<'a>(
83        codes: &'a [GeneticCodeJSON],
84        id: &u8,
85    ) -> Option<&'a GeneticCodeJSON> {
86        codes.iter().find(|code| code.id == *id)
87    }
88
89    /// Return a GeneticCode object give a translation table id (integer)
90    ///
91    /// # Arguments
92    ///
93    /// * `translation_table`: u8 representing the translation table
94    ///
95    /// # Returns
96    ///
97    /// GeneticCode object containing translation table of choice
98    pub fn genetic_code_from_id(translation_table: &u8) -> GeneticCode {
99        let mut code = GeneticCode::new();
100        if let Ok(json_codes) = load_embedded_genetic_codes() {
101            if let Some(selected_code) = get_genetic_code_by_id(&json_codes, translation_table) {
102                code.id = selected_code.id.to_string();
103                code.name = selected_code.name.first().cloned().unwrap_or_default();
104                code.codon_map = selected_code.to_codon_map();
105            } else {
106                eprintln!("Error: Genetic Code ID {} not found!", translation_table);
107            }
108        } else {
109            eprintln!("Failed to load genetic codes");
110        }
111        code
112    }
113
114    /// Parse a DNA sequence into codon counts.
115    ///
116    /// # Arguments
117    ///
118    /// * `sequence` - A string representing the DNA sequence.
119    ///
120    /// # Returns
121    ///
122    /// A result with a HashMap mapping codon strings to their occurrence count.
123    pub fn count_codons(sequence: &str) -> HashMap<String, usize> {
124        let mut codon_counts = HashMap::new();
125
126        // Ignore sequences that are not a multiple of 3
127        if sequence.len() % 3 != 0 {
128            return codon_counts; // Return an empty HashMap instead of None
129        }
130
131        for codon in sequence.as_bytes().chunks(3) {
132            let codon_str = String::from_utf8_lossy(codon).to_uppercase();
133            *codon_counts.entry(codon_str).or_insert(0) += 1;
134        }
135
136        codon_counts
137    }
138
139    /// Compute Relative Synonymous Codon Usage (RSCU) values.
140    ///
141    /// # Arguments
142    ///
143    /// * codon_counts - A hashmap with codon counts for the sequence.
144    /// * code - a GeneticCode object to be used for codon translation
145    ///
146    /// # Returns
147    ///
148    /// A result with a HashMap mapping codon strings to their RSCU value
149    pub fn compute_rscu(codon_counts: &HashMap<String, usize>, code: &GeneticCode) -> HashMap<String, f64> {
150        let codon_table = &code.codon_map;
151        let mut rscu_values = HashMap::new();
152        let mut amino_acid_totals: HashMap<&str, usize> = HashMap::new();
153        let mut synonymous_codons: HashMap<&str, Vec<&str>> = HashMap::new();
154    
155        // Group codons by their amino acid and count occurrences
156        for (codon, amino_acid) in codon_table {
157            synonymous_codons
158                .entry(amino_acid)
159                .or_insert(Vec::new())
160                .push(codon);
161            *amino_acid_totals.entry(amino_acid).or_insert(0) +=
162                codon_counts.get(codon).copied().unwrap_or(0);
163        }
164    
165        // Compute RSCU values
166        for (amino_acid, codons) in &synonymous_codons {
167            let total_codon_count = amino_acid_totals.get(amino_acid).copied().unwrap_or(0) as f64;
168            let num_codons = codons.len() as f64;
169    
170            for codon in codons {
171                let observed = *codon_counts.get(*codon).unwrap_or(&0) as f64;
172                let expected = total_codon_count / num_codons;
173                let rscu = if expected > 0.0 {
174                    observed / expected
175                } else {
176                    0.0
177                };
178    
179                rscu_values.insert((*codon).to_string(), rscu);
180            }
181        }
182    
183        rscu_values
184    }
185
186    /// Write RSCU values to a CSV file.
187    pub fn write_rscu_to_csv(
188        filename: &str,
189        rscu_data: &Vec<(String, HashMap<String, f64>)>,
190    ) -> std::io::Result<()> {
191        let mut file = File::create(filename)?;
192
193        // Collect all unique codons across sequences to create CSV headers
194        let mut codon_set = HashSet::new();
195        for (_, codon_map) in rscu_data {
196            for codon in codon_map.keys() {
197                codon_set.insert(codon.clone());
198            }
199        }
200
201        let mut codons: Vec<String> = codon_set.into_iter().collect();
202        codons.sort(); // Ensure consistent order
203
204        // Write the CSV header
205        write!(file, "Sequence")?;
206        for codon in &codons {
207            write!(file, ",{}", codon)?;
208        }
209        writeln!(file)?;
210
211        // Write RSCU values for each sequence
212        for (seq_name, codon_map) in rscu_data {
213            write!(file, "{}", seq_name)?;
214            for codon in &codons {
215                let rscu_value = codon_map.get(codon).unwrap_or(&0.0);
216                write!(file, ",{:.6}", rscu_value)?;
217            }
218            writeln!(file)?;
219        }
220
221        Ok(())
222    }
223
224    /// Compute the mean RSCU value for each codon
225    pub fn compute_mean_rscu(rscu_results: &Vec<(String, HashMap<String, f64>)>) -> HashMap<String, f64> {
226        let mut total_rscu: HashMap<String, f64> = HashMap::new();
227        let gene_count = rscu_results.len() as f64;
228
229        for (_, rscu_map) in rscu_results {
230            for (codon, value) in rscu_map {
231                *total_rscu.entry(codon.clone()).or_insert(0.0) += value;
232            }
233        }
234
235        let mut mean_rscu = HashMap::new();
236        for (codon, total_value) in total_rscu {
237            mean_rscu.insert(codon, total_value / gene_count);
238        }
239
240        mean_rscu
241    }
242
243    /// Compute standard deviation of RSCU values
244    pub fn compute_std_rscu(
245        rscu_results: &Vec<(String, HashMap<String, f64>)>,
246        mean_rscu: &HashMap<String, f64>,
247    ) -> HashMap<String, f64> {
248        let mut variance: HashMap<String, f64> = HashMap::new();
249        let gene_count = rscu_results.len() as f64;
250
251        for (_, rscu_map) in rscu_results {
252            for (codon, value) in rscu_map {
253                let mean = mean_rscu.get(codon).unwrap_or(&0.0);
254                let diff = value - mean;
255                *variance.entry(codon.clone()).or_insert(0.0) += diff * diff;
256            }
257        }
258
259        let mut std_dev = HashMap::new();
260        for (codon, var) in variance {
261            std_dev.insert(codon, (var / gene_count).sqrt());
262        }
263
264        std_dev
265    }
266
267    /// Compute Z-score from RSCU values
268    pub fn compute_rscu_z_scores(
269        rscu_results: &Vec<(String, HashMap<String, f64>)>,
270        mean_rscu: &HashMap<String, f64>,
271        std_rscu: &HashMap<String, f64>,
272    ) -> Vec<(String, HashMap<String, f64>)> {
273        let mut z_scores = Vec::new();
274
275        for (gene, rscu_map) in rscu_results {
276            let mut gene_z_scores = HashMap::new();
277            for (codon, value) in rscu_map {
278                let mean = mean_rscu.get(codon).unwrap_or(&0.0);
279                let std_dev = std_rscu.get(codon).unwrap_or(&1.0); // Avoid division by zero
280                let z_score = (value - mean) / std_dev;
281                gene_z_scores.insert(codon.clone(), z_score);
282            }
283            z_scores.push((gene.clone(), gene_z_scores));
284        }
285
286        z_scores
287    }
288
289    /// Write Z-scores to a CSV file
290    pub fn write_z_scores_to_csv(
291        filename: &str,
292        z_scores: &Vec<(String, HashMap<String, f64>)>,
293    ) -> io::Result<()> {
294        let mut file = File::create(filename)?;
295
296        let mut codon_set = HashSet::new();
297        for (_, codon_map) in z_scores {
298            for codon in codon_map.keys() {
299                codon_set.insert(codon.clone());
300            }
301        }
302
303        let mut codons: Vec<String> = codon_set.into_iter().collect();
304        codons.sort();
305
306        // Write header
307        write!(file, "Gene")?;
308        for codon in &codons {
309            write!(file, ",{}", codon)?;
310        }
311        writeln!(file)?;
312
313        // Write Z-scores for each gene
314        for (gene, codon_map) in z_scores {
315            write!(file, "{}", gene)?;
316            for codon in &codons {
317                let z_score = codon_map.get(codon).unwrap_or(&0.0);
318                write!(file, ",{:.6}", z_score)?;
319            }
320            writeln!(file)?;
321        }
322
323        Ok(())
324    }
325
326    /// Count the amino acids for a CDS
327    ///
328    /// # Arguments
329    ///
330    /// * sequence: a str of the sequence to be analysed
331    /// * code: a GeneticCode object to be used for codon translation
332    ///
333    /// # Returns
334    ///
335    /// A result with a HashMap mapping amino acids to their count
336    pub fn count_amino_acids(sequence: &str, code: &GeneticCode) -> HashMap<String, usize> {
337        let codon_table = &code.codon_map;
338        let mut amino_acid_counts = HashMap::new();
339
340        for codon in sequence.as_bytes().chunks(3) {
341            if codon.len() == 3 {
342                let codon_str = String::from_utf8_lossy(codon).to_uppercase();
343                if let Some(amino_acid) = codon_table.get(codon_str.as_str()) {
344                    *amino_acid_counts.entry(amino_acid.to_string()).or_insert(0) += 1;
345                }
346            }
347        }
348
349        amino_acid_counts
350    }
351
352    /// Write codon and amino acid counts to a CSV file
353    ///
354    /// # Arguments
355    ///
356    /// * filename_prefix: str to be used as prefix for output files
357    /// * codon_data: The codon counts to be written
358    /// * amino_acid_data: The amino acid counts to be written
359    /// 
360    /// # Returns
361    ///
362    /// A result with two output files: `prefix`_codon.csv for codons and `prefix`_amino_acids.csv for amino acids
363    pub fn write_counts_to_csv(
364        filename_prefix: &str,
365        codon_data: &Vec<(String, HashMap<String, usize>)>,
366        amino_acid_data: &Vec<(String, HashMap<String, usize>)>,
367    ) -> std::io::Result<()> {
368        let codon_filename = format!("{}_codon.csv", filename_prefix);
369        let amino_filename = format!("{}_amino_acids.csv", filename_prefix);
370
371        // Create files
372        let mut codon_file = File::create(&codon_filename)?;
373        let mut amino_file = File::create(&amino_filename)?;
374
375        // Collect all unique codons and amino acids across sequences
376        let mut codon_set = HashSet::new();
377        let mut amino_acid_set = HashSet::new();
378
379        for (_, codon_map) in codon_data {
380            for codon in codon_map.keys() {
381                codon_set.insert(codon.clone());
382            }
383        }
384
385        for (_, amino_map) in amino_acid_data {
386            for amino in amino_map.keys() {
387                amino_acid_set.insert(amino.clone());
388            }
389        }
390
391        let mut codons: Vec<String> = codon_set.into_iter().collect();
392        let mut amino_acids: Vec<String> = amino_acid_set.into_iter().collect();
393        codons.sort(); // Ensure consistent order
394        amino_acids.sort();
395
396        // Write the CSV header for codons
397        write!(codon_file, "Sequence")?;
398        for codon in &codons {
399            write!(codon_file, ",{}", codon)?;
400        }
401        writeln!(codon_file)?;
402
403        // Write codon counts for each sequence
404        for (seq_name, codon_map) in codon_data {
405            write!(codon_file, "{}", seq_name)?;
406            for codon in &codons {
407                let count = codon_map.get(codon).unwrap_or(&0);
408                write!(codon_file, ",{}", count)?;
409            }
410            writeln!(codon_file)?;
411        }
412
413        // Write the CSV header for amino acids
414        write!(amino_file, "Sequence")?;
415        for amino in &amino_acids {
416            write!(amino_file, ",{}", amino)?;
417        }
418        writeln!(amino_file)?;
419
420        // Write amino acid counts for each sequence
421        for (seq_name, amino_map) in amino_acid_data {
422            write!(amino_file, "{}", seq_name)?;
423            for amino in &amino_acids {
424                let count = amino_map.get(amino).unwrap_or(&0);
425                write!(amino_file, ",{}", count)?;
426            }
427            writeln!(amino_file)?;
428        }
429
430        Ok(())
431    }
432
433    /// Read sequences from a multi-FASTA file
434    ///
435    /// # Arguments
436    ///
437    /// * filename_prefix: str to be used as prefix for output files
438    /// * codon_data: The codon counts to be written
439    /// * amino_acid_data: The amino acid counts to be written
440    /// 
441    /// # Returns
442    ///
443    /// A result with two output files: `prefix`_codon.csv for codons and `prefix`_amino_acids.csv for amino acids
444    pub fn read_sequences_from_fasta(filename: &str) -> io::Result<Vec<(String, String)>> {
445        let file = File::open(filename)?;
446        let reader = BufReader::new(file);
447
448        let mut sequences = Vec::new();
449        let mut current_seq_name = String::new();
450        let mut current_sequence = String::new();
451
452        for line in reader.lines() {
453            let line = line?;
454            if line.starts_with('>') {
455                if !current_seq_name.is_empty() {
456                    sequences.push((current_seq_name.clone(), current_sequence.clone()));
457                    current_sequence.clear();
458                }
459                current_seq_name = line[1..].to_string();
460            } else {
461                current_sequence.push_str(&line);
462            }
463        }
464        if !current_seq_name.is_empty() {
465            sequences.push((current_seq_name, current_sequence));
466        }
467
468        Ok(sequences)
469    }
470}