1#[allow(non_snake_case)]
8pub mod FastX
9{
10 use flate2::read::MultiGzDecoder;
11 use std::ffi::OsStr;
12 use std::io;
13 use std::io::BufRead;
14
15 const PER_THREAD_BUF_SIZE: usize = 600 * 1024 * 1024;
16
17 pub enum FastXFormat
18 {
19 FASTQ,
20 FASTA,
21 EOF,
22 UNKNOWN,
23 }
24
25 #[derive(Default)]
26 pub struct FastARecord
27 {
28 pub name: String,
29 pub raw_seq: Vec<u8>,
30 }
31
32 #[derive(Default)]
33 pub struct FastQRecord
34 {
35 name: String,
36 seq: Vec<u8>,
37 comment: String,
38 qual: Vec<u8>,
39 }
40
41 pub trait FastXRead: std::fmt::Display
42 {
43 fn read(&mut self, reader: &mut dyn BufRead) -> io::Result<usize>;
44 fn name(&self) -> &String;
45 fn id(&self) -> &str;
46 fn desc(&self) -> &str;
47 fn seq_raw(&self) -> &Vec<u8>;
48 fn seq(&self) -> Vec<u8>;
49 fn seq_len(&self) -> usize;
50 fn lines(&self) -> Vec<&[u8]>;
51 }
52
53 pub trait FastQRead: FastXRead
54 {
55 fn comment(&self) -> &str;
56 fn qual(&self) -> &Vec<u8>;
57 }
58
59 impl std::fmt::Display for FastARecord
60 {
61 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
62 {
63 write!(f, ">{}\n{}", self.name(), String::from_utf8_lossy(&self.seq()))
64 }
65 }
66
67 impl FastXRead for FastARecord
68 {
69 fn name(&self) -> &String
70 {
71 &self.name
72 }
73
74 fn id(&self) -> &str
75 {
76 match memchr::memchr(b' ', self.name.as_bytes())
77 {
78 None => &self.name,
79 Some(i) => &self.name[..i],
80 }
81 }
82
83 fn desc(&self) -> &str
84 {
85 match memchr::memchr(b' ', self.name.as_bytes())
86 {
87 None => "",
88 Some(i) => &self.name[i + 1..],
89 }
90 }
91
92 fn seq_raw(&self) -> &Vec<u8>
93 {
94 &self.raw_seq
95 }
96
97 fn seq(&self) -> Vec<u8>
98 {
99 let mut seq = vec![0; self.raw_seq.len()];
100 let mut line_start = 0;
101 let mut seq_end = 0;
102 let mut seq_start = 0;
103 memchr::memchr_iter(b'\n', &self.raw_seq).for_each(|line_end| {
104 seq_start = seq_end;
105 seq_end += line_end - line_start;
106 seq[seq_start..seq_end].copy_from_slice(&self.raw_seq[line_start..line_end]);
107 line_start = line_end + 1; });
109 if line_start < self.raw_seq.len()
110 {
111 seq_start = seq_end;
112 seq_end += self.raw_seq.len() - line_start;
113 seq[seq_start..seq_end]
114 .copy_from_slice(&self.raw_seq[line_start..self.raw_seq.len()]);
115 seq.resize(seq_end, 0);
116 }
117 seq
118 }
119
120 fn lines(&self) -> Vec<&[u8]>
121 {
122 let mut line_start = 0;
124 memchr::memchr_iter(b'\n', &self.raw_seq)
125 .map(|line_end| {
126 let line = &self.raw_seq[line_start..line_end];
127 line_start = line_end + 1;
128 line
129 })
130 .collect()
131 }
132
133 fn seq_len(&self) -> usize
134 {
135 let mut line_start = 0;
136 memchr::memchr_iter(b'\n', &self.raw_seq).fold(0, |mut len, line_end| {
137 len += line_end - line_start;
138 line_start = line_end + 1;
139 len
140 }) + self.raw_seq.len()
141 - line_start
142 }
143
144 fn read(&mut self, reader: &mut dyn BufRead) -> io::Result<usize>
145 {
146 self.name.clear();
147 self.raw_seq.clear();
148
149 let mut size = 0;
150
151 let mut record_sep = [0_u8];
152 match reader.read(&mut record_sep)
153 {
154 Err(e) => return Err(e),
155 Ok(0) => return Ok(0),
156 Ok(some) =>
157 {
158 size += some;
159 if some != 1 && record_sep[0] != b'>'
160 {
161 return Err(io::Error::new(
162 io::ErrorKind::InvalidData,
163 "record_sep does not match '>'",
164 ));
165 }
166 }
167 };
168
169 match reader.read_line(&mut self.name)
170 {
171 Err(e) => return Err(e),
172 Ok(0) => return Ok(0),
173 Ok(some) => size += some,
174 };
175 rstrip_newline_string(&mut self.name);
176
177 match read_until_before(reader, b'>', &mut self.raw_seq)
178 {
179 Err(e) => Err(e),
180 Ok(0) => Ok(0),
181 Ok(some) =>
182 {
183 rstrip_seq(&mut self.raw_seq);
184 Ok(size + some)
185 }
186 }
187 }
188 }
189
190 impl std::fmt::Display for FastQRecord
191 {
192 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
193 {
194 write!(
195 f,
196 "@{}\n{}\n+\n{}",
197 self.name(),
198 String::from_utf8_lossy(&self.seq()),
199 String::from_utf8_lossy(&self.qual())
200 )
201 }
202 }
203
204 impl FastXRead for FastQRecord
205 {
206 fn name(&self) -> &String
207 {
208 &self.name
209 }
210
211 fn id(&self) -> &str
212 {
213 match memchr::memchr(b' ', self.name.as_bytes())
214 {
215 None => &self.name[1..],
216 Some(i) => &self.name[1..i],
217 }
218 }
219
220 fn desc(&self) -> &str
221 {
222 match memchr::memchr(b' ', self.name.as_bytes())
223 {
224 None => "",
225 Some(i) => &self.name[i + 1..],
226 }
227 }
228
229 fn seq_raw(&self) -> &Vec<u8>
230 {
231 &self.seq
232 }
233
234 fn seq(&self) -> Vec<u8>
236 {
237 self.seq.clone()
238 }
239
240 fn seq_len(&self) -> usize
241 {
242 self.seq
243 .split(|c| *c == b'\n')
244 .fold(0, |len, seq| len + seq.len())
245 }
246
247 fn lines(&self) -> Vec<&[u8]>
248 {
249 self.seq.split(|c| *c == b'\n').collect()
250 }
251
252 fn read(&mut self, reader: &mut dyn BufRead) -> io::Result<usize>
253 {
254 self.name.clear();
255 let mut size;
256 match reader.read_line(&mut self.name)
257 {
258 Err(e) => return Err(e),
259 Ok(0) => return Ok(0),
260 Ok(some) => size = some,
261 }
262 rstrip_newline_string(&mut self.name); assert!(self.name.remove(0) == '@');
264
265 self.seq.clear();
266 match reader.read_until(b'\n', &mut self.seq)
267 {
268 Err(e) => return Err(e),
269 Ok(0) => return Ok(0),
270 Ok(some) =>
271 {
272 rstrip_newline_vec(&mut self.seq);
273 size += some;
274 }
275 }
276
277 self.comment.clear();
278 match reader.read_line(&mut self.comment)
279 {
280 Err(e) => return Err(e),
281 Ok(0) => return Ok(0),
282 Ok(some) =>
283 {
284 rstrip_newline_string(&mut self.comment); size += some
286 }
287 }
288
289 self.qual.clear();
290 match reader.read_until(b'\n', &mut self.qual)
291 {
292 Err(e) => Err(e),
293 Ok(0) => Ok(0),
294 Ok(some) =>
295 {
296 rstrip_newline_vec(&mut self.qual);
297 Ok(size + some)
298 }
299 }
300 }
301 }
302
303 impl FastQRead for FastQRecord
304 {
305 fn comment(&self) -> &str
306 {
307 &self.comment[1..]
308 }
309
310 fn qual(&self) -> &Vec<u8>
311 {
312 &self.qual
313 }
314 }
315
316 fn rstrip_newline_string(s: &mut String)
317 {
318 while s.ends_with(&"\n")
319 {
320 s.truncate(s.len() - 1);
321 }
322 }
323
324 fn rstrip_seq(s: &mut Vec<u8>)
325 {
326 while s[s.len() - 1] == b'>' || s[s.len() - 1] == b'\n'
327 {
329 s.truncate(s.len() - 1);
330 }
331 }
332
333 fn rstrip_newline_vec(s: &mut Vec<u8>)
334 {
335 while s[s.len() - 1] == b'\n'
336 {
337 s.truncate(s.len() - 1);
338 }
339 }
340
341 pub fn peek(reader: &mut dyn BufRead) -> io::Result<(FastXFormat, u8)>
344 {
345 let buf = reader.fill_buf().expect("peek failed");
346 let format = match buf[0] as char
347 {
348 '>' => FastXFormat::FASTA,
349 '@' => FastXFormat::FASTQ,
350 '\0' => FastXFormat::EOF,
351 _ => FastXFormat::UNKNOWN,
352 };
353 if let FastXFormat::UNKNOWN = format
354 {
355 return Err(io::Error::new(
356 io::ErrorKind::InvalidData,
357 "Wrong format expected '>' or '@'!",
358 ));
359 }
360 Ok((format, buf[0]))
361 }
362
363 use std::fs::File;
364 use std::io::BufReader;
365 use std::path::Path;
366 pub fn reader_from_path(path: &Path) -> io::Result<Box<dyn BufRead>>
369 {
370 let file = File::open(path)?;
371 let reader: Box<dyn BufRead> = match path.extension()
372 {
373 Some(extension) if extension == OsStr::new("gz") => Box::new(BufReader::with_capacity(
374 PER_THREAD_BUF_SIZE,
375 MultiGzDecoder::new(BufReader::new(file)),
376 )),
377 _ => Box::new(BufReader::with_capacity(PER_THREAD_BUF_SIZE, file)),
378 };
379 Ok(reader)
380 }
381
382 pub fn from_reader(reader: &mut dyn BufRead) -> io::Result<Box<dyn FastXRead>>
383 {
384 let (format, first) = peek(reader)?;
385 match format
386 {
387 FastXFormat::FASTA => Ok(Box::new(FastARecord::default())),
388 FastXFormat::FASTQ => Ok(Box::new(FastQRecord::default())),
389 FastXFormat::EOF | FastXFormat::UNKNOWN =>
390 {
391 Err(io::Error::new(io::ErrorKind::InvalidData, format!("{:?}", first)))
392 }
393 }
394 }
395
396 fn read_until_before<R: BufRead + ?Sized>(
398 r: &mut R,
399 delim: u8,
400 buf: &mut Vec<u8>,
401 ) -> io::Result<usize>
402 {
403 let mut read = 0;
404 loop
405 {
406 let (done, used) = {
407 let available = match r.fill_buf()
408 {
409 Ok(n) => n,
410 Err(e) => return Err(e),
412 };
413 match memchr::memchr(delim, available)
414 {
415 Some(i) =>
416 {
417 buf.extend_from_slice(&available[..=i]);
418 (true, i + 1)
419 }
420 None =>
421 {
422 buf.extend_from_slice(available);
423 (false, available.len())
424 }
425 }
426 };
427 if done
428 {
429 r.consume(used - 1); read += used - 1;
431 return Ok(read);
432 }
433 else
434 {
435 r.consume(used);
436 read += used;
437 }
438 if used == 0
439 {
440 return Ok(read);
441 }
442 }
443 }
444}
445
446#[cfg(test)]
447mod tests
448{
449 use super::FastX::FastARecord;
450 use super::FastX::FastQRecord;
451 use super::FastX::FastXRead;
452 use std::io::BufReader;
453 use std::io::Cursor;
454
455 #[test]
456 fn fasta()
457 {
458 let mut x = BufReader::new(Cursor::new(">a\nAGTC\n>b\nTAGC\nTTTT\n>c\nGCTA"));
459 let mut record = FastARecord::default();
460 let _ = record.read(&mut x);
461 assert_eq!("a", record.name());
462 assert_eq!(b"AGTC".to_vec(), record.seq());
463 assert_eq!(&b"AGTC".to_vec(), record.seq_raw());
464 let _ = record.read(&mut x);
465 assert_eq!("b", record.name());
466 assert_eq!(b"TAGCTTTT".to_vec(), record.seq());
467 assert_eq!(&b"TAGC\nTTTT".to_vec(), record.seq_raw());
468 let _ = record.read(&mut x);
469 assert_eq!("c", record.name());
470 assert_eq!(b"GCTA".to_vec(), record.seq());
471 assert_eq!(&b"GCTA".to_vec(), record.seq_raw());
472 }
473
474 #[test]
475 fn fasta_memchr()
476 {
477 let fasta = b">a\nAGTC\n>b\nTAGC\nTTTT\n>c\nGCTA\n";
478 let mut reader = BufReader::new(Cursor::new(fasta));
479 let mut record = FastARecord::default();
480 let mut output: Vec<u8> = Vec::new();
481 while let Ok(_some @ 1..=usize::MAX) = record.read(&mut reader)
483 {
484 output.push(b'>');
485 output.append(&mut record.name().as_bytes().to_vec());
486 output.push(b'\n');
487 let seq = record.seq_raw();
488 println!("r{}r", String::from_utf8_lossy(seq));
489 let mut offset = 0;
490 memchr::memchr_iter(b'\n', seq)
491 .map(|line_end| {
492 let seq = &seq[offset..line_end];
493 println!("#{}#", String::from_utf8_lossy(seq));
494 offset = line_end + 1;
495 seq
496 })
497 .for_each(|line| {
498 output.append(&mut line.to_vec());
499 output.push(b'\n');
500 });
501 output.append(&mut seq[offset..].to_vec());
502 output.push(b'\n');
503 }
504 println!("{}\n\n{}", String::from_utf8_lossy(fasta), String::from_utf8_lossy(&output));
505 assert_eq!(fasta.to_vec(), output);
506 }
507
508 #[test]
509 fn fastq()
510 {
511 let mut x = BufReader::new(Cursor::new(
512 "@a\nAGTC\n+\n'&'*+\n@b\nTAGCTTTT\n+\n'&'*+'&'*+\n@c\nGCTA\n+\n'&'*+",
513 ));
514 let mut record = FastQRecord::default();
515 let _ = record.read(&mut x);
516 assert_eq!("a", record.name());
517 assert_eq!(b"AGTC".to_vec(), record.seq());
518 assert_eq!(&b"AGTC".to_vec(), record.seq_raw());
519
520 let _ = record.read(&mut x);
521 assert_eq!("b", record.name());
522 assert_eq!(b"TAGCTTTT".to_vec(), record.seq());
523 assert_eq!(&b"TAGCTTTT".to_vec(), record.seq_raw());
524
525 let _ = record.read(&mut x);
526 assert_eq!("c", record.name());
527 assert_eq!(b"GCTA".to_vec(), record.seq());
528 assert_eq!(&b"GCTA".to_vec(), record.seq_raw());
529 }
530}