gfatk/
utils.rs

1use anyhow::{bail, Context, Result};
2use atty::Stream;
3use gfa::optfields::{OptField, OptFieldVal::*};
4use petgraph::graph::NodeIndex;
5use std::collections::HashMap;
6use std::fmt;
7
8/// Format a sequence length (`usize`) to kilobases.
9pub fn format_usize_to_kb(num: usize) -> String {
10    let div = num as f32 / 1000f32;
11    format!("{:.2}Kb", div)
12}
13
14/// Check if there is anything coming from STDIN.
15pub fn is_stdin() -> bool {
16    !atty::is(Stream::Stdin)
17}
18
19/// Get the coverage associated with an edge (`ec` tag in the GFA).
20pub fn get_edge_coverage(options: &[OptField]) -> Result<i64> {
21    if let Some(op) = options.iter().next() {
22        match op.tag {
23            // ec
24            [101, 99] => match op.value {
25                Int(i) => return Ok(i),
26                _ => bail!("Could not find integer ec:i:<i64> tag."),
27            },
28            _ => bail!("Could not find ec (edge coverage) tag."),
29        };
30    }
31    bail!("Edge coverage not found.")
32}
33
34/// Format a GFA option field into a string.
35pub fn get_option_string(options: Vec<OptField>) -> Result<String> {
36    let mut tag_val = String::new();
37    for op in options {
38        let tag = std::str::from_utf8(&op.tag)
39            .with_context(|| format!("Malformed UTF8: {:?}", op.tag))?;
40        let value = match op.value {
41            Float(f) => format!(":f:{:.3}", f),
42            A(a) => format!(":A:{}", a),
43            Int(i) => format!(":i:{}", i),
44            Z(z) => format!(
45                ":Z:{}",
46                std::str::from_utf8(&z).with_context(|| format!("Malformed UTF8: {:?}", z))?
47            ),
48            // J(j) => ???,
49            // a hexadecimal array
50            H(h) => format!(":H:{}", h.iter().map(|x| x.to_string()).collect::<String>()),
51            // B is a general array
52            // is it capital B?
53            BInt(bi) => format!(
54                ":B:{}",
55                bi.iter().map(|x| x.to_string()).collect::<String>()
56            ),
57            BFloat(bf) => format!(
58                ":B:{}",
59                bf.iter().map(|x| format!("{:.3}", x)).collect::<String>()
60            ),
61            _ => "".to_string(),
62        };
63        tag_val += &format!("{}{}\t", tag, value);
64    }
65    // should always end in \t ^
66    let tag_val_op_un = tag_val
67        .strip_suffix('\t')
68        .context("Could not strip a tab from the suffix of the tag.")?;
69    Ok(tag_val_op_un.to_string())
70}
71
72/// Parse a CIGAR string slice into an overlap length.
73pub fn parse_cigar(cigar: &[u8]) -> Result<usize> {
74    // check it ends with an M
75    if !cigar.ends_with(&[77]) {
76        bail!(
77            "CIGAR string did not end with M: {}",
78            std::str::from_utf8(cigar).with_context(|| format!("Malformed UTF8: {:?}", cigar))?
79        );
80    }
81    let stripped = cigar.strip_suffix(&[77]);
82    match stripped {
83        Some(s) => {
84            let string_rep =
85                std::str::from_utf8(s).with_context(|| format!("Malformed UTF8: {:?}", s))?;
86            Ok(string_rep
87                .parse::<usize>()
88                .with_context(|| format!("{} could not be parsed to <usize>", string_rep))?)
89        }
90        None => bail!("Could not strip suffix (M) of the CIGAR string."),
91    }
92}
93
94/// Reverse complement a string slice.
95pub fn reverse_complement(dna: &[u8]) -> Vec<u8> {
96    let dna_vec = dna.to_vec();
97    let mut revcomp = Vec::new();
98
99    for base in dna_vec.iter() {
100        revcomp.push(switch_base(*base))
101    }
102    revcomp.as_mut_slice().reverse();
103    revcomp
104}
105
106/// Used in `reverse_complement` to switch to a complementary base.
107fn switch_base(c: u8) -> u8 {
108    match c {
109        b'A' => b'T',
110        b'a' => b't',
111        b'C' => b'G',
112        b'c' => b'g',
113        b'T' => b'A',
114        b't' => b'a',
115        b'G' => b'C',
116        b'g' => b'c',
117        b'N' => b'N',
118        b'n' => b'n',
119        _ => b'N',
120    }
121}
122
123// pinched from past Max
124// https://github.com/tolkit/fasta_windows/blob/master/src/seq_statsu8.rs
125
126/// Collect nucleotide counts into a `HashMap`.
127fn nucleotide_counts(dna: &[u8]) -> HashMap<&u8, i32> {
128    let mut map = HashMap::new();
129    for nucleotide in dna {
130        let count = map.entry(nucleotide).or_insert(0);
131        *count += 1;
132    }
133    map
134}
135
136/// Calculate the GC content of a string slice.
137pub fn gc_content(dna: &[u8]) -> f32 {
138    // G/C/A/T counts
139    // upper + lower
140    let counts = nucleotide_counts(dna);
141    let g_counts = counts.get(&71).unwrap_or(&0) + counts.get(&103).unwrap_or(&0);
142    let c_counts = counts.get(&67).unwrap_or(&0) + counts.get(&99).unwrap_or(&0);
143    let a_counts = counts.get(&65).unwrap_or(&0) + counts.get(&97).unwrap_or(&0);
144    let t_counts = counts.get(&84).unwrap_or(&0) + counts.get(&116).unwrap_or(&0);
145
146    (g_counts + c_counts) as f32 / (g_counts + c_counts + a_counts + t_counts) as f32
147}
148
149// convert Node Index to segment ID and vice versa
150// I rely a lot on this tuple:
151// (NodeIndex, usize)
152// which stores the node index and it's corresponding segment ID
153// I just realise this should 100000% be a hashmap... change that later.
154
155/// A pair consisting of a node index and a segment ID.
156#[derive(Clone, Copy, Debug)]
157pub struct GFAGraphPair {
158    /// The node index (petgraph's `NodeIndex`).
159    pub node_index: NodeIndex,
160    /// The segment ID.
161    pub seg_id: usize,
162}
163/// A vector of `GFAGraphPair`'s.
164///
165/// This should 100% have been a map-like structure...
166#[derive(Clone, Debug)]
167pub struct GFAGraphLookups(pub Vec<GFAGraphPair>);
168
169impl GFAGraphLookups {
170    /// Create a new GFAGraphLookups
171    pub fn new() -> Self {
172        Self(Vec::new())
173    }
174    /// Push a new `GFAGraphPair` to the end.
175    pub fn push(&mut self, other: GFAGraphPair) {
176        self.0.push(other);
177    }
178
179    /// Return segment ID from a node index.
180    pub fn node_index_to_seg_id(&self, node_index: NodeIndex) -> Result<usize> {
181        let seg_id = &self
182            .0
183            .iter()
184            .find(|e| e.node_index == node_index)
185            .with_context(|| {
186                format!(
187                    "Node index {:?} could not be converted to segment ID",
188                    node_index
189                )
190            })?
191            .seg_id;
192
193        Ok(*seg_id)
194    }
195    /// Return a node index from a segment ID.
196    pub fn seg_id_to_node_index(&self, seg_id: usize) -> Result<NodeIndex> {
197        let node_index = &self
198            .0
199            .iter()
200            .find(|e| e.seg_id == seg_id)
201            .with_context(|| {
202                format!(
203                    "Segment ID {:?} could not be converted to NodeIndex",
204                    seg_id
205                )
206            })?
207            .node_index;
208
209        Ok(*node_index)
210    }
211}
212
213impl fmt::Display for GFAGraphLookups {
214    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
215        let mut output = String::new();
216        output += "\n\tSegment ID's:\n\t";
217
218        let mut seg_ids: String = self
219            .0
220            .iter()
221            .map(|pair| pair.seg_id.to_string() + ", ")
222            .collect();
223        seg_ids.drain(seg_ids.len() - 2..);
224
225        output += &seg_ids;
226
227        writeln!(f, "{}", output)
228    }
229}
230
231#[cfg(test)]
232mod tests {
233
234    use super::*;
235
236    #[test]
237    fn test_node_seg_id_indexes() {
238        let mut gl = GFAGraphLookups::new();
239        // just add two pairs
240        gl.push(GFAGraphPair {
241            node_index: NodeIndex::new(1),
242            seg_id: 12,
243        });
244
245        gl.push(GFAGraphPair {
246            node_index: NodeIndex::new(2),
247            seg_id: 10,
248        });
249
250        assert_eq!(12, gl.node_index_to_seg_id(NodeIndex::new(1)).unwrap());
251        assert_eq!(NodeIndex::new(2), gl.seg_id_to_node_index(10).unwrap());
252    }
253
254    #[test]
255    fn test_gc_content() {
256        let dna_bytes = vec![b'A', b'G', b'G', b'T', b'T', b'C'];
257
258        let gc = gc_content(&dna_bytes);
259
260        assert_eq!(gc, 0.5);
261    }
262
263    #[test]
264    fn test_cigar_parse() {
265        let cigar_ok = "120M".as_bytes();
266        let cigar_err = "30M10D20M5I10M".as_bytes();
267
268        let parsed_cigar = parse_cigar(cigar_ok).is_ok();
269        let parsed_cigar2 = parse_cigar(cigar_err).is_err();
270
271        assert!(parsed_cigar);
272        assert!(parsed_cigar2);
273    }
274}