1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
use flate2::write::GzEncoder;
use flate2::Compression;
use needletail::parse_fastx_file;
use serde_json::json;
use sha2::{Digest, Sha256, Sha512};
use std::fs::OpenOptions;
use std::io::{BufWriter, Write};
use std::path::PathBuf;

struct PrepStats {
    num_trimmed: u64,
    num_records: u64,
}

impl PrepStats {
    fn new() -> Self {
        Self {
            num_trimmed: 0,
            num_records: 0,
        }
    }
}

struct Signatures {
    name_hash_256: Sha256,
    seq_hash_256: Sha256,
    name_hash_512: Sha512,
    seq_hash_512: Sha512,
}

impl Signatures {
    fn new() -> Self {
        Self {
            name_hash_256: Sha256::new(),
            seq_hash_256: Sha256::new(),
            name_hash_512: Sha512::new(),
            seq_hash_512: Sha512::new(),
        }
    }

    fn add_name(&mut self, n: &[u8]) {
        self.name_hash_256.update(n);
        self.name_hash_512.update(n);
    }

    fn add_seq(&mut self, s: &[u8]) {
        self.seq_hash_256.update(s);
        self.seq_hash_512.update(s);
    }
}

pub struct RecordParseConfig {
    pub input: Vec<String>,
    pub output_stem: PathBuf,
    pub polya_clip_length: Option<usize>,
}

fn stream_with_clip(
    filenames: &[String],
    poly_a: &[u8],
    output_filename: PathBuf,
    mut stats: PrepStats,
    mut sigs: Signatures,
) -> anyhow::Result<Vec<PathBuf>> {
    let seq_filename = output_filename.with_extension("fa.gz");
    let seq_file = OpenOptions::new()
        .write(true)
        .truncate(true)
        .create(true)
        .open(seq_filename.clone())?;

    let seq_file = GzEncoder::new(seq_file, Compression::default());
    let mut seq_file = BufWriter::new(seq_file);

    for filename in filenames.iter() {
        let mut reader = parse_fastx_file(filename).expect("valid path/file");
        while let Some(record) = reader.next() {
            let seqrec = record.expect("invalid record");
            let seqid = seqrec.id();
            let seq = seqrec.seq();
            stats.num_records += 1;
            sigs.add_name(seqid);
            sigs.add_seq(&seq);

            if seq.ends_with(poly_a) {
                let owned = String::from_utf8(seq.into_owned()).unwrap();
                let trimmed = owned.trim_end_matches('A');
                stats.num_trimmed += 1;
                writeln!(&mut seq_file, ">{}", String::from_utf8_lossy(seqrec.id()))?;
                writeln!(&mut seq_file, "{}", trimmed)?;
            } else {
                writeln!(&mut seq_file, ">{}", String::from_utf8_lossy(seqrec.id()))?;
                writeln!(&mut seq_file, "{}", String::from_utf8_lossy(&seq))?;
            }
        }
    }

    let sig_file = OpenOptions::new()
        .write(true)
        .truncate(true)
        .create(true)
        .open(output_filename.with_extension("json"))?;

    eprintln!("trimmed polyA tails from {} records", stats.num_trimmed);
    write_sig_file(sig_file, &stats, sigs)?;
    Ok(vec![seq_filename])
}

fn write_sig_file(
    sig_file: std::fs::File,
    stats: &PrepStats,
    sigs: Signatures,
) -> anyhow::Result<()> {
    let mut sig_file = BufWriter::new(sig_file);

    let v = json!({
        "sha256_names" : hex::encode(sigs.name_hash_256.finalize()),
        "sha512_names" : hex::encode(sigs.name_hash_512.finalize()),
        "sha512_seqs" : hex::encode(sigs.seq_hash_512.finalize()),
        "sha256_seqs" : hex::encode(sigs.seq_hash_256.finalize()),
        "num_records" : stats.num_records,
        "num_trimmed_polya" : stats.num_trimmed
    });

    write!(&mut sig_file, "{}", v)?;
    Ok(())
}

fn stream_for_signatures(
    filenames: &[String],
    output_filename: PathBuf,
    mut stats: PrepStats,
    mut sigs: Signatures,
) -> anyhow::Result<()> {
    for filename in filenames.iter() {
        let mut reader = parse_fastx_file(filename).expect("valid path/file");
        while let Some(record) = reader.next() {
            let seqrec = record.expect("invalid record");
            let seqid = seqrec.id();
            let seq = seqrec.seq();
            stats.num_records += 1;
            sigs.add_name(seqid);
            sigs.add_seq(&seq);
        }
    }

    let sig_file = OpenOptions::new()
        .write(true)
        .truncate(true)
        .create(true)
        .open(output_filename.with_extension("json"))?;

    eprintln!("trimmed polyA tails from {} records", stats.num_trimmed);
    write_sig_file(sig_file, &stats, sigs)
}

/// Parses the list of input files according to the configuration.
/// If modifications are to be made (e.g. polyA clipping is enabled), then
/// it will return a Vec<PathBuf> containing a single element; the file path 
/// of the modified reference it created. If no alterations are being made 
/// and instead the signatures are just being computed, then it will return 
/// a Vec<PathBuf> containing the paths to the initial input files it was
/// given.
pub fn parse_records(args: RecordParseConfig) -> anyhow::Result<Vec<PathBuf>> {
    let filenames = args.input;
    let clip_polya;
    let poly_a = if let Some(alen) = args.polya_clip_length {
        clip_polya = true;
        vec![b'A'; alen]
    } else {
        clip_polya = false;
        vec![]
    };
    let output_filename = args.output_stem;
    let stats = PrepStats::new();
    let sigs = Signatures::new();

    if clip_polya {
        stream_with_clip(&filenames, &poly_a, output_filename, stats, sigs)
    } else {
        stream_for_signatures(&filenames, output_filename, stats, sigs)?;
        Ok(filenames.iter().map(PathBuf::from).collect())
    }
}