1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
use std::fmt;
use std::collections::HashMap;

use fastx::FastX::{self, FastQRecord, FastXRead, FastQRead};

extern crate chrono;
use chrono::{DateTime, FixedOffset};

extern crate permutation;


/// Column-oriented table of per-read statistics, filled by `fastq_df`.
///
/// Each vector holds one entry per record; entry `i` of every column belongs
/// to the same record and is printed as one tab-separated row by the
/// `Display` implementation.
// The nucleotide-count fields keep the uppercase base letter in their names,
// hence the `non_snake_case` allowance.
#[allow(non_snake_case)]
pub struct Dataframe
{
    /// Sequence length of each record.
    seq_length   : Vec<usize>,
    /// Mean quality of each record: quality bytes minus 33, averaged over the
    /// sequence length.
    mean_quality : Vec<f64>,
    /// First (up to) four sequence bytes; slots not covered by a shorter
    /// sequence stay 0.
    kmer_start   : Vec<[u8;4]>,
    /// Last (up to) four sequence bytes, stored from index 0; slots not
    /// covered by a shorter sequence stay 0.
    kmer_end     : Vec<[u8;4]>,
    /// Number of `A` bytes in the sequence (uppercase only).
    ntc_A        : Vec<usize>,
    /// Number of `G` bytes in the sequence (uppercase only).
    ntc_G        : Vec<usize>,
    /// Number of `T` bytes in the sequence (uppercase only).
    ntc_T        : Vec<usize>,
    /// Number of `C` bytes in the sequence (uppercase only).
    ntc_C        : Vec<usize>,
    /// Number of `U` bytes in the sequence (uppercase only).
    ntc_U        : Vec<usize>,
    /// Value of the `ch` key taken from the record description.
    channel      : Vec<usize>,
    /// Value of the `start_time` key taken from the record description,
    /// parsed as an RFC 3339 timestamp.
    start_time   : Vec<DateTime<FixedOffset>>,
}

/// Tab-separated text rendering of a [`Dataframe`].
impl fmt::Display for Dataframe
{
    /// Writes one header line followed by one line per record.
    ///
    /// Columns are separated by tabs and every line ends with `\n`.
    /// `mean_quality` is printed with two decimals. The k-mer columns are
    /// decoded as UTF-8; bytes that are not valid UTF-8 are rendered as the
    /// Unicode replacement character instead of aborting the formatting.
    /// Zero bytes stored for sequences shorter than four bases are written
    /// out unchanged.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by the underlying formatter.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
    {
        writeln!(f, "seq_length\tmean_quality\tkmer_start\tkmer_end\tnt_A\tnt_G\tnt_T\tnt_C\tnt_U\tchannels\tstart_times")?;
        for i in 0..self.seq_length.len()
        {
            writeln!(f, "{}\t{:.2}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
                     self.seq_length[i],
                     self.mean_quality[i],
                     // Lossy decoding borrows the bytes when they are valid
                     // UTF-8 and never panics, unlike `from_utf8(..).expect(..)`.
                     String::from_utf8_lossy(&self.kmer_start[i]),
                     String::from_utf8_lossy(&self.kmer_end[i]),
                     self.ntc_A[i],
                     self.ntc_G[i],
                     self.ntc_T[i],
                     self.ntc_C[i],
                     self.ntc_U[i],
                     self.channel[i],
                     self.start_time[i],
                     )?;
        }
        Ok(())
    }
}

/// Prints the tab-separated rendering of `df` to standard output.
///
/// The dataframe is taken by value and dropped when the function returns.
pub fn write_df(df : Dataframe)
{
    print!("{table}", table = df);
}

use std::io;
#[allow(non_snake_case)]
pub fn fastq_df(path: String) -> io::Result<Dataframe>
{
    let mut seq_length   = Vec::new();
    let mut mean_quality = Vec::new();
    let mut kmer_start   = Vec::new();
    let mut kmer_end     = Vec::new();
    let mut ntc_A        = Vec::new();
    let mut ntc_G        = Vec::new();
    let mut ntc_T        = Vec::new();
    let mut ntc_C        = Vec::new();
    let mut ntc_U        = Vec::new();
    let mut channel      = Vec::new();
    let mut start_time   = Vec::new();

    let mut fastx_reader = FastX::reader_from_path(std::path::Path::new(&path))?;
    let mut fastx_record = FastQRecord::default();

    while let Ok(_some @ 1..=usize::MAX) = fastx_record.read(&mut fastx_reader)
    {
        let seq_len = fastx_record.seq_len();

        seq_length.push(seq_len);

        let k = if seq_len < 4 { seq_len } else { 4 };

        let mut kmer : [u8;4] = [0,0,0,0];
        kmer[0..k].copy_from_slice(&fastx_record.seq()[0..k]);
        kmer_start.push(kmer);

        let mut kmer : [u8;4] = [0,0,0,0];
        kmer[0..k].copy_from_slice(&fastx_record.seq()[fastx_record.seq().len()-k..]);
        kmer_end.push(kmer);

        let sum = fastx_record.qual().iter().fold(0.0, |sum, x| sum + (*x - 33) as f64);
        mean_quality.push(sum / seq_len as f64);

        let mut nts : [usize;256] = [0;256];
        for b in fastx_record.seq()
        {
            nts[b as usize] += 1;
        }

        ntc_A.push(nts['A' as usize]);
        ntc_G.push(nts['G' as usize]);
        ntc_T.push(nts['T' as usize]);
        ntc_C.push(nts['C' as usize]);
        ntc_U.push(nts['U' as usize]);

        let info : HashMap<String, String> = fastx_record.desc().split(' ')
            .map(|kv| kv.split('=').collect::<Vec<&str>>())
            .map(|vec| { (vec[0].to_string(), vec[1].to_string()) })
            .collect();

        channel.push(info["ch"].parse().unwrap());
        let st = DateTime::parse_from_rfc3339(&info["start_time"]);
        match st
        {
            Ok(time) => start_time.push(time),
            Err(e) => eprintln!("Error parsing time: {}", e)
        }
    }

    let order = permutation::sort(&start_time[..]);
    let df = Dataframe
    {
        seq_length :   order.apply_slice(&seq_length[..]),
        mean_quality:  order.apply_slice(&mean_quality[..]),
        kmer_start:    order.apply_slice(&kmer_start[..]),
        kmer_end:      order.apply_slice(&kmer_end[..]),
        ntc_A:         order.apply_slice(&ntc_A[..]),
        ntc_G:         order.apply_slice(&ntc_G[..]),
        ntc_T:         order.apply_slice(&ntc_T[..]),
        ntc_C:         order.apply_slice(&ntc_C[..]),
        ntc_U:         order.apply_slice(&ntc_U[..]),
        channel:       order.apply_slice(&channel[..]),
        start_time:    order.apply_slice(&start_time[..])
    };
    return Ok(df);
}


#[cfg(test)]
mod tests
{
    /// Placeholder check that the test harness runs; it does not exercise
    /// any code from this file.
    #[test]
    fn it_works()
    {
        let sum = 2 + 2;
        assert_eq!(sum, 4);
    }
}