bustools/
utils.rs

1//! Utilities
2use indicatif::{ProgressBar, ProgressStyle};
3use std::collections::HashSet;
4use std::hash::Hash;
5
/// Collects the elements of a vector into a [`HashSet`].
///
/// The vector is consumed; elements that compare equal end up as a single entry.
pub fn vec2set<T: Eq + Hash>(x: Vec<T>) -> HashSet<T> {
    let mut unique = HashSet::with_capacity(x.len());
    unique.extend(x);
    unique
}
10
/// Encodes a base sequence (`A`, `C`, `G`, `T`) as an integer.
///
/// Each base occupies two bits (`A = 0`, `C = 1`, `G = 2`, `T = 3`) and the
/// first base of `seq` ends up in the most significant position, i.e. the
/// sequence is read as a base-4 number. The empty sequence encodes to `0`.
///
/// # Panics
/// * if `seq` is longer than 32 bytes (it would not fit into a `u64`)
/// * if `seq` contains any character other than `A`, `C`, `G` or `T`
pub fn seq_to_int(seq: &str) -> u64 {
    // 32 bases * 2 bits = 64 bits: longer sequences cannot be represented.
    assert!(
        seq.len() <= 32,
        "cannot encode a sequence of length {} into a single 64bit integer (max 32 bases)",
        seq.len()
    );

    // Accumulate two bits per base directly instead of building a string of
    // digits and parsing it again. The shift cannot lose bits because at most
    // 32 bases are processed.
    seq.chars().fold(0_u64, |encoded, base| {
        let digit: u64 = match base {
            'A' => 0,
            'C' => 1,
            'G' => 2,
            'T' => 3,
            c => panic!("unkown seq character {}", c),
        };
        (encoded << 2) | digit
    })
}
26
/// Decodes an integer into its base sequence (`0 = A`, `1 = C`, `2 = G`, `3 = T`).
///
/// `i` is written as a base-4 number and left-padded with `A` until it is
/// `seq_len` characters long. At least one base is always emitted, and if the
/// encoding of `i` needs more than `seq_len` bases the full (longer) sequence
/// is returned unchanged.
pub fn int_to_seq(i: u64, seq_len: usize) -> String {
    const BASES: [char; 4] = ['A', 'C', 'G', 'T'];

    // Peel off base-4 digits, least significant first.
    let mut remaining = i;
    let mut bases: Vec<char> = Vec::with_capacity(seq_len);
    loop {
        // `remaining % 4` is always in 0..4, so the cast and the lookup are safe.
        bases.push(BASES[(remaining % 4) as usize]);
        remaining /= 4;
        if remaining == 0 {
            break;
        }
    }

    // Padding is appended here and becomes *leading* `A`s after the reversal.
    while bases.len() < seq_len {
        bases.push('A');
    }

    bases.iter().rev().collect()
}
57
58/// returns a progress bar instance with standard formatting
59pub fn get_progressbar(total: u64) -> ProgressBar {
60    let bar = ProgressBar::new(total);
61    bar.set_style(
62        ProgressStyle::default_bar()
63            .template("[{elapsed_precise} ETA {eta}] {bar:40.cyan/blue} {pos}/{len} {per_sec}")
64            .unwrap()
65            .progress_chars("##-"),
66    );
67    bar
68}
69
70pub fn get_spinner() -> indicatif::ProgressBar{
71    let bar = indicatif::ProgressBar::new_spinner();
72    bar.set_style(
73        indicatif::ProgressStyle::default_bar()
74            .template("[{elapsed_precise}] {pos} {per_sec}")
75            .unwrap()
76            .progress_chars("##-"),
77    );
78    bar
79}
80
pub mod argsort {
    //! A workaround for the unsortable `Vec<f64>` (due to NaN).
    //!
    //! # Example
    //! allows something like
    //! ```rust
    //! # use bustools::utils::argsort::{argsort_float,argmax_float};
    //! argsort_float(&vec![1.1_f64, -0.1_f64], true);
    //! argmax_float(&vec![1.0_f64, 10_f64]);
    //! ```
    //!
    //! # References
    //! <https://stackoverflow.com/questions/69764050/how-to-get-the-indices-that-would-sort-a-vector-in-rust>
    //! <https://stackoverflow.com/questions/28247990/how-to-do-a-binary-search-on-a-vec-of-floats>

    use std::cmp::Ordering;

    /// An `f64` that is known not to be NaN, which makes a total order possible.
    #[derive(PartialEq, PartialOrd)]
    struct NonNan(f64);

    impl NonNan {
        /// Wraps `val`, returning `None` if it is NaN.
        fn new(val: f64) -> Option<NonNan> {
            if val.is_nan() {
                None
            } else {
                Some(NonNan(val))
            }
        }
    }
    impl Eq for NonNan {}

    impl Ord for NonNan {
        fn cmp(&self, other: &NonNan) -> Ordering {
            // `partial_cmp` on floats only fails for NaN, which `NonNan::new` rules out.
            self.partial_cmp(other)
                .expect("NonNan values are always comparable")
        }
    }

    /// Wraps `value` into a `NonNan`, panicking (and reporting the whole input) on NaN.
    fn non_nan_or_panic(value: f64, fvec: &[f64]) -> NonNan {
        NonNan::new(value).unwrap_or_else(|| panic!("Nan values in {:?}", fvec))
    }

    /// Argsort a `slice[T]`: returns the indices that sort `slice` in ascending order.
    ///
    /// The sort is stable, i.e. equal elements keep their original relative order.
    pub fn argsort<T: Ord>(slice: &[T]) -> Vec<usize> {
        let mut keys: Vec<usize> = (0..slice.len()).collect();
        keys.sort_by(|&a, &b| slice[a].cmp(&slice[b]));
        keys
    }

    /// Argsort of a f64 vector assuming no NaN.
    ///
    /// With `ascending == true` the returned indices go from the smallest to
    /// the largest value; with `ascending == false` that order is reversed
    /// (largest value first).
    ///
    /// # Panics
    /// Panics if `fvec` contains a NaN.
    pub fn argsort_float(fvec: &Vec<f64>, ascending: bool) -> Vec<usize> {
        let keys: Vec<NonNan> = fvec
            .iter()
            .map(|&value| non_nan_or_panic(value, fvec))
            .collect();

        // `argsort` always sorts ascending; only flip when descending is requested.
        let mut order = argsort(&keys);
        if !ascending {
            order.reverse();
        }
        order
    }

    /// Argmax of a f64 vector assuming no NaN: returns `(index, value)` of the maximum.
    ///
    /// If the maximum occurs several times, the last occurrence is reported.
    ///
    /// # Panics
    /// Panics if `fvec` is empty or contains a NaN.
    pub fn argmax_float(fvec: &Vec<f64>) -> (usize, f64) {
        // Single pass instead of a full sort; every element is still NaN-checked.
        fvec.iter()
            .enumerate()
            .map(|(ix, &value)| (ix, non_nan_or_panic(value, fvec)))
            .max_by(|a, b| a.1.cmp(&b.1))
            .map(|(ix, value)| (ix, value.0))
            .expect("cannot take the argmax of an empty vector")
    }

    /// Argmin of a f64 vector assuming no NaN: returns `(index, value)` of the minimum.
    ///
    /// If the minimum occurs several times, the first occurrence is reported.
    ///
    /// # Panics
    /// Panics if `fvec` is empty or contains a NaN.
    pub fn argmin_float(fvec: &Vec<f64>) -> (usize, f64) {
        // Single pass instead of a full sort; every element is still NaN-checked.
        fvec.iter()
            .enumerate()
            .map(|(ix, &value)| (ix, non_nan_or_panic(value, fvec)))
            .min_by(|a, b| a.1.cmp(&b.1))
            .map(|(ix, value)| (ix, value.0))
            .expect("cannot take the argmin of an empty vector")
    }

    #[test]
    fn test_argmin() {
        let v = vec![1.0, -1.0, 10.0, 0.0];
        assert_eq!(argmin_float(&v), (1, -1.0));
    }

    #[test]
    fn test_argmax() {
        let v = vec![1.0, -1.0, 10.0, 0.0];
        assert_eq!(argmax_float(&v), (2, 10.0));
    }

    #[test]
    fn test_argsort_float_direction() {
        let v = vec![1.0, -1.0, 10.0, 0.0];
        assert_eq!(argsort_float(&v, true), vec![1, 3, 0, 2]);
        assert_eq!(argsort_float(&v, false), vec![2, 0, 3, 1]);
    }
}
167
/// Returns the minimum of `x` together with every index at which it occurs.
///
/// The indices are reported in increasing order.
///
/// # Panics
/// Panics if `x` is empty.
pub fn min_argmin<T: Ord + Copy>(x: &[T]) -> (T, Vec<usize>) {
    use std::cmp::Ordering;

    let (&first, rest) = x.split_first().expect("x has to contain an element");

    // Start with the first element as the running minimum and scan the rest.
    let mut smallest = first;
    let mut positions: Vec<usize> = vec![0];

    for (offset, candidate) in rest.iter().enumerate() {
        // `rest` starts at the second element of `x`
        let ix = offset + 1;
        match candidate.cmp(&smallest) {
            Ordering::Less => {
                // strictly smaller: forget everything collected so far
                smallest = *candidate;
                positions.clear();
                positions.push(ix);
            }
            // another occurrence of the current minimum
            Ordering::Equal => positions.push(ix),
            Ordering::Greater => {}
        }
    }

    (smallest, positions)
}
#[test]
fn test_min_argmin() {
    // (input, expected minimum, expected indices of the minimum)
    let cases: Vec<(Vec<i32>, i32, Vec<usize>)> = vec![
        // basic: a single minimum
        (vec![4, 3, 2, 1], 1, vec![3]),
        // multiple mins
        (vec![4, 1, 2, 1], 1, vec![1, 3]),
        // multiple mins in a longer input that also repeats other values
        (vec![4, 2, 3, 2, 1, 2, 1, 4], 1, vec![4, 6]),
    ];

    for (input, expected_min, expected_indices) in cases {
        assert_eq!(min_argmin(&input), (expected_min, expected_indices));
    }
}
220
#[cfg(test)]
mod tests {
    use crate::utils::{int_to_seq, seq_to_int};

    /// Single bases map to 0..=3; longer sequences are read as base-4 numbers.
    #[test]
    fn encode_seq() {
        let cases = [("A", 0), ("C", 1), ("G", 2), ("T", 3), ("GCCA", 148)];
        for &(seq, expected) in &cases {
            assert_eq!(seq_to_int(seq), expected);
        }
    }

    /// Decoding is the inverse of encoding, with leading `A`s as padding.
    #[test]
    fn decode_seq() {
        // (encoded value, requested length, expected sequence)
        let cases = [
            // base order
            (0, 1, "A"),
            (1, 1, "C"),
            (2, 1, "G"),
            (3, 1, "T"),
            // padding leading A's
            (0, 3, "AAA"),
            (1, 3, "AAC"),
            (2, 3, "AAG"),
            (3, 3, "AAT"),
            // two significant digits
            (4, 2, "CA"),
            (5, 2, "CC"),
            (6, 2, "CG"),
            (7, 2, "CT"),
            // longer example
            (148, 4, "GCCA"),
        ];
        for &(encoded, seq_len, expected) in &cases {
            assert_eq!(int_to_seq(encoded, seq_len), expected);
        }

        // Not ported from the Python reference tests: there, decoding raised an
        // AssertionError when the decoded string was longer than requested
        // (e.g. value 148 with seq_len = 2) or when the value was negative
        // (-1 with seq_len = 1), since that probably indicates a bug in the caller.
    }
}
bustools/utils.rs

bustools/
utils.rs