1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
use std::collections::HashMap;
use std::convert::TryInto;
use std::io::Write;
use std::mem::size_of;
use crate::block::Block;
use crate::error::Error;
use crate::nucleotide::Nucleotide;
use crate::reader::{Field, SIGNATURE};
pub mod fasta;
const FIELD_SIZE: usize = size_of::<Field>();
#[derive(Clone)]
pub struct SequenceLength {
pub name: String,
pub length: usize,
}
impl SequenceLength {
pub fn new<S: ToString>(name: &S, length: usize) -> Self {
Self {
name: name.to_string(),
length,
}
}
#[must_use]
pub const fn len(&self) -> usize {
self.length
}
#[must_use]
pub const fn is_empty(&self) -> bool {
self.length == 0
}
}
pub trait SequenceRead<'a> {
fn sequence_lengths(&'a self) -> Result<&[SequenceLength], Box<dyn std::error::Error>>;
fn nucleotides(&self, chr: &str) -> Result<Box<dyn Nucleotides>, Box<dyn std::error::Error>>;
fn soft_masked_blocks(&'a self, chr: &str) -> Result<&'a [Block], Box<dyn std::error::Error>>;
fn hard_masked_blocks(&'a self, chr: &str) -> Result<&'a [Block], Box<dyn std::error::Error>>;
}
pub trait Nucleotides {
fn read_chunk(
&mut self,
buf: &mut [Nucleotide],
) -> std::result::Result<Option<usize>, Box<dyn std::error::Error>>;
}
#[allow(clippy::too_many_lines)]
pub fn to_2bit<'a>(
writer: &mut dyn Write,
extractor: &'a (dyn SequenceRead<'a> + 'a),
) -> Result<(), Box<dyn std::error::Error>> {
let seqs = extractor.sequence_lengths()?;
let signature: Field = SIGNATURE;
let version: Field = 0;
let sequence_count: Field = seqs.len() as Field;
let reserved: Field = 0;
for input in &[&signature, &version, &sequence_count, &reserved] {
writer.write_all(&input.to_ne_bytes())?;
}
let mut index_size: Field = 0;
let mut sequence_records_size: Field = 0;
let mut relative_sequence_offsets = HashMap::<&str, Field>::new();
for seq_length in seqs.iter() {
let chr = &seq_length.name;
if !chr.is_ascii() {
return Err(Box::new(Error::FileFormat(format!(
"Chromosome name is not ASCII: {}",
&chr
))));
}
let addition_to_index: Field = usize2field(1 + chr.len() + FIELD_SIZE)?;
index_size += addition_to_index;
relative_sequence_offsets.insert(chr, sequence_records_size);
let hard_masked_blocks = extractor.hard_masked_blocks(chr)?;
let soft_masked_blocks = extractor.soft_masked_blocks(chr)?;
let length: usize = seq_length.length;
sequence_records_size += {
let block_bytewidth = size_of::<u32>() * 2;
let integer = FIELD_SIZE
+ FIELD_SIZE
+ block_bytewidth * hard_masked_blocks.len()
+ FIELD_SIZE
+ block_bytewidth * soft_masked_blocks.len()
+ FIELD_SIZE;
let condensed_length = length >> 2;
let adjust_partial_quartet = if length % 4 == 0 { 0 } else { 1 };
usize2field(integer + condensed_length + adjust_partial_quartet)?
};
}
let sequence_records_start: Field = 16 + index_size;
for seq_length in seqs.iter() {
let chr = &seq_length.name;
let name = chr.as_bytes();
let name_size: u8 = name.len().try_into().map_err(|_| {
Error::FileFormat(format!(
"chromosome name is longer than 255 characters: {}",
name.len()
))
})?;
let mut buf: Vec<u8> = Vec::with_capacity(2 * FIELD_SIZE + name.len());
buf.push(name_size);
buf.extend_from_slice(name);
let sequence_offset: Field = sequence_records_start
+ *relative_sequence_offsets
.get(chr as &str)
.expect("previous loop");
buf.extend_from_slice(&sequence_offset.to_ne_bytes());
writer.write_all(&buf)?;
}
for seq_length in seqs.iter() {
let chr = &seq_length.name;
let length = seq_length.length;
let length_field = (length as Field).to_ne_bytes();
writer.write_all(&length_field)?;
write_blocks(writer, extractor.hard_masked_blocks(chr)?)?;
write_blocks(writer, extractor.soft_masked_blocks(chr)?)?;
writer.write_all(&reserved.to_ne_bytes())?;
let mut data_buf = [Nucleotide::N; 80];
let mut nuc_modulo: u8 = 0;
let mut byte: u8 = 0;
let mut buf = [0_u8; 1];
let mut sequence_length = 0;
let mut nucleotides = extractor.nucleotides(chr)?;
while let Some(num_nucleotides_read) = nucleotides
.read_chunk(&mut data_buf)
.map_err(|e| Error::FileFormat(e.to_string()))?
{
for (i, nuc) in data_buf.iter().enumerate() {
if i == num_nucleotides_read {
break;
}
byte |= nuc.bits();
nuc_modulo += 1;
if nuc_modulo == 4 {
buf[0] = byte;
writer.write_all(&buf)?;
byte = 0;
nuc_modulo = 0;
} else {
byte <<= 2;
}
sequence_length += 1;
}
}
if nuc_modulo > 0 {
byte <<= 2 * (4 - nuc_modulo);
buf[0] = byte;
writer.write_all(&buf)?;
}
if sequence_length != length {
let msg = format!(
"The reported sequence length of {} for sequence {} is wrong (seen {} nucleotides)",
length, chr, sequence_length,
);
return Err(Box::new(Error::FileFormat(msg)));
}
}
Ok(())
}
fn write_blocks(writer: &mut dyn Write, blocks: &[Block]) -> Result<(), Error> {
let num_blocks: Field = usize2field(blocks.len())?;
writer.write_all(&num_blocks.to_ne_bytes())?;
let mut block_lengths = Vec::with_capacity(&blocks.len() * 4);
for block in blocks {
let start: Field = usize2field(block.start)?;
writer.write_all(&start.to_ne_bytes())?;
let distance: Field = {
if let Some(diff) = block.end.checked_sub(block.start) {
usize2field(diff)?
} else {
return Err(Error::FileFormat("Block end < Block start".to_string()));
}
};
block_lengths.extend_from_slice(&distance.to_ne_bytes());
}
writer.write_all(&block_lengths)?;
Ok(())
}
fn usize2field(v: usize) -> Result<Field, Error> {
v.try_into()
.map_err(|_| Error::FileFormat(format!("Number {} is too big to store in a 2bit Field", v)))
}