1use std::fs::File;
22use std::io::{BufWriter, Write};
23use std::path::Path;
24
25use cyanea_core::{Annotated, CyaneaError, Result, Sequence};
26
27use crate::fastq::{parse_fastq_stats, FastqRecord, FastqStats};
28use crate::quality::{PhredEncoding, QualityScores};
29use crate::types::DnaSequence;
30
31#[derive(Debug, Clone, Copy, PartialEq, Eq)]
37pub enum MateValidation {
38 Strict,
40 Relaxed,
42 None,
44}
45
46#[derive(Debug, Clone)]
48pub struct PairedFastqRecord {
49 r1: FastqRecord,
50 r2: FastqRecord,
51}
52
53impl PairedFastqRecord {
54 pub fn new(r1: FastqRecord, r2: FastqRecord, validation: MateValidation) -> Result<Self> {
56 match validation {
57 MateValidation::Strict => validate_mate_pair_strict(&r1, &r2)?,
58 MateValidation::Relaxed => validate_mate_pair(&r1, &r2)?,
59 MateValidation::None => {}
60 }
61 Ok(Self { r1, r2 })
62 }
63
64 pub fn new_unchecked(r1: FastqRecord, r2: FastqRecord) -> Self {
66 Self { r1, r2 }
67 }
68
69 pub fn r1(&self) -> &FastqRecord {
71 &self.r1
72 }
73
74 pub fn r2(&self) -> &FastqRecord {
76 &self.r2
77 }
78
79 pub fn into_reads(self) -> (FastqRecord, FastqRecord) {
81 (self.r1, self.r2)
82 }
83
84 pub fn pair_name(&self) -> &str {
86 strip_read_suffix(self.r1.name())
87 }
88}
89
90#[derive(Debug, Clone)]
92pub struct PairedFastqStats {
93 pub pair_count: u64,
95 pub r1_stats: FastqStats,
97 pub r2_stats: FastqStats,
99}
100
101pub fn strip_read_suffix(name: &str) -> &str {
111 if name.len() >= 2 {
112 let suffix = &name[name.len() - 2..];
113 if suffix == "/1" || suffix == "/2" || suffix == "_1" || suffix == "_2" {
114 return &name[..name.len() - 2];
115 }
116 }
117 name
118}
119
120pub fn validate_mate_pair(r1: &FastqRecord, r2: &FastqRecord) -> Result<()> {
122 let name1 = strip_read_suffix(r1.name());
123 let name2 = strip_read_suffix(r2.name());
124 if name1 != name2 {
125 return Err(CyaneaError::InvalidInput(format!(
126 "mate pair name mismatch: '{}' vs '{}'",
127 r1.name(),
128 r2.name()
129 )));
130 }
131 Ok(())
132}
133
134pub fn validate_mate_pair_strict(r1: &FastqRecord, r2: &FastqRecord) -> Result<()> {
136 if !r1.name().ends_with("/1") {
137 return Err(CyaneaError::InvalidInput(format!(
138 "R1 read name '{}' does not end with /1",
139 r1.name()
140 )));
141 }
142 if !r2.name().ends_with("/2") {
143 return Err(CyaneaError::InvalidInput(format!(
144 "R2 read name '{}' does not end with /2",
145 r2.name()
146 )));
147 }
148 validate_mate_pair(r1, r2)
149}
150
151fn record_from_parts(id: &[u8], seq: &[u8], qual: Option<&[u8]>) -> Result<FastqRecord> {
153 let raw_id =
154 std::str::from_utf8(id).map_err(|e| CyaneaError::Parse(e.to_string()))?;
155 let (name, description) = match raw_id.split_once(char::is_whitespace) {
156 Some((n, d)) => (n.to_string(), Some(d.to_string())),
157 None => (raw_id.to_string(), None),
158 };
159 let sequence = DnaSequence::new(seq)?;
160 let qual_bytes =
161 qual.ok_or_else(|| CyaneaError::Parse("missing quality scores".into()))?;
162 let quality = QualityScores::from_ascii(qual_bytes, PhredEncoding::Phred33)?;
163 FastqRecord::new(name, description, sequence, quality)
164}
165
166pub fn parse_paired_fastq_files(
175 r1_path: impl AsRef<Path>,
176 r2_path: impl AsRef<Path>,
177 validation: MateValidation,
178) -> Result<Vec<PairedFastqRecord>> {
179 let r1_empty = std::fs::metadata(r1_path.as_ref())
180 .map(|m| m.len() == 0)
181 .unwrap_or(false);
182 let r2_empty = std::fs::metadata(r2_path.as_ref())
183 .map(|m| m.len() == 0)
184 .unwrap_or(false);
185 if r1_empty && r2_empty {
186 return Ok(Vec::new());
187 }
188 if r1_empty != r2_empty {
189 return Err(CyaneaError::InvalidInput(
190 "one file is empty but the other is not".into(),
191 ));
192 }
193
194 let mut r1_reader = needletail::parse_fastx_file(r1_path.as_ref())
195 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
196 let mut r2_reader = needletail::parse_fastx_file(r2_path.as_ref())
197 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
198
199 let mut pairs = Vec::new();
200 loop {
201 let r1_next = r1_reader.next();
202 let r2_next = r2_reader.next();
203
204 match (r1_next, r2_next) {
205 (Some(r1), Some(r2)) => {
206 let r1 = r1.map_err(|e| CyaneaError::Parse(e.to_string()))?;
207 let r2 = r2.map_err(|e| CyaneaError::Parse(e.to_string()))?;
208 let r1_rec = record_from_parts(r1.id(), &r1.seq(), r1.qual())?;
209 let r2_rec = record_from_parts(r2.id(), &r2.seq(), r2.qual())?;
210 pairs.push(PairedFastqRecord::new(r1_rec, r2_rec, validation)?);
211 }
212 (None, None) => break,
213 (Some(_), None) => {
214 return Err(CyaneaError::InvalidInput(
215 "R1 file has more records than R2 file".into(),
216 ));
217 }
218 (None, Some(_)) => {
219 return Err(CyaneaError::InvalidInput(
220 "R2 file has more records than R1 file".into(),
221 ));
222 }
223 }
224 }
225
226 Ok(pairs)
227}
228
229pub fn parse_interleaved_fastq(
234 path: impl AsRef<Path>,
235 validation: MateValidation,
236) -> Result<Vec<PairedFastqRecord>> {
237 if std::fs::metadata(path.as_ref())
238 .map(|m| m.len() == 0)
239 .unwrap_or(false)
240 {
241 return Ok(Vec::new());
242 }
243
244 let mut reader = needletail::parse_fastx_file(path.as_ref())
245 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
246
247 let mut pairs = Vec::new();
248 loop {
249 let r1 = match reader.next() {
250 Some(r1) => r1.map_err(|e| CyaneaError::Parse(e.to_string()))?,
251 None => break,
252 };
253 let r1_rec = record_from_parts(r1.id(), &r1.seq(), r1.qual())?;
254
255 let r2 = reader
256 .next()
257 .ok_or_else(|| {
258 CyaneaError::InvalidInput(
259 "odd number of records in interleaved FASTQ".into(),
260 )
261 })?
262 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
263 let r2_rec = record_from_parts(r2.id(), &r2.seq(), r2.qual())?;
264
265 pairs.push(PairedFastqRecord::new(r1_rec, r2_rec, validation)?);
266 }
267
268 Ok(pairs)
269}
270
271pub fn parse_paired_fastq_stats(
273 r1_path: impl AsRef<Path>,
274 r2_path: impl AsRef<Path>,
275) -> Result<PairedFastqStats> {
276 let r1_stats = parse_fastq_stats(r1_path)?;
277 let r2_stats = parse_fastq_stats(r2_path)?;
278 let pair_count = r1_stats.sequence_count.min(r2_stats.sequence_count);
279 Ok(PairedFastqStats {
280 pair_count,
281 r1_stats,
282 r2_stats,
283 })
284}
285
286fn write_fastq_record_to(
292 writer: &mut impl Write,
293 record: &FastqRecord,
294 encoding: PhredEncoding,
295) -> Result<()> {
296 writer.write_all(b"@")?;
297 writer.write_all(record.name().as_bytes())?;
298 if let Some(desc) = record.description() {
299 writer.write_all(b" ")?;
300 writer.write_all(desc.as_bytes())?;
301 }
302 writer.write_all(b"\n")?;
303 writer.write_all(record.sequence().as_bytes())?;
304 writer.write_all(b"\n+\n")?;
305 writer.write_all(&record.quality().to_ascii(encoding))?;
306 writer.write_all(b"\n")?;
307 Ok(())
308}
309
310pub fn write_paired_fastq(
312 pairs: &[PairedFastqRecord],
313 r1_path: impl AsRef<Path>,
314 r2_path: impl AsRef<Path>,
315 encoding: PhredEncoding,
316) -> Result<()> {
317 let mut r1_writer = BufWriter::new(File::create(r1_path.as_ref())?);
318 let mut r2_writer = BufWriter::new(File::create(r2_path.as_ref())?);
319
320 for pair in pairs {
321 write_fastq_record_to(&mut r1_writer, &pair.r1, encoding)?;
322 write_fastq_record_to(&mut r2_writer, &pair.r2, encoding)?;
323 }
324
325 r1_writer.flush()?;
326 r2_writer.flush()?;
327 Ok(())
328}
329
330pub fn write_interleaved_fastq(
332 pairs: &[PairedFastqRecord],
333 path: impl AsRef<Path>,
334 encoding: PhredEncoding,
335) -> Result<()> {
336 let mut writer = BufWriter::new(File::create(path.as_ref())?);
337
338 for pair in pairs {
339 write_fastq_record_to(&mut writer, &pair.r1, encoding)?;
340 write_fastq_record_to(&mut writer, &pair.r2, encoding)?;
341 }
342
343 writer.flush()?;
344 Ok(())
345}
346
347pub fn interleave_fastq_files(
356 r1_path: impl AsRef<Path>,
357 r2_path: impl AsRef<Path>,
358 output_path: impl AsRef<Path>,
359 validation: MateValidation,
360) -> Result<u64> {
361 let mut r1_reader = needletail::parse_fastx_file(r1_path.as_ref())
362 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
363 let mut r2_reader = needletail::parse_fastx_file(r2_path.as_ref())
364 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
365 let mut writer = BufWriter::new(File::create(output_path.as_ref())?);
366
367 let encoding = PhredEncoding::Phred33;
368 let mut count = 0u64;
369
370 loop {
371 let r1_next = r1_reader.next();
372 let r2_next = r2_reader.next();
373
374 match (r1_next, r2_next) {
375 (Some(r1), Some(r2)) => {
376 let r1 = r1.map_err(|e| CyaneaError::Parse(e.to_string()))?;
377 let r2 = r2.map_err(|e| CyaneaError::Parse(e.to_string()))?;
378 let r1_rec = record_from_parts(r1.id(), &r1.seq(), r1.qual())?;
379 let r2_rec = record_from_parts(r2.id(), &r2.seq(), r2.qual())?;
380
381 match validation {
382 MateValidation::Strict => validate_mate_pair_strict(&r1_rec, &r2_rec)?,
383 MateValidation::Relaxed => validate_mate_pair(&r1_rec, &r2_rec)?,
384 MateValidation::None => {}
385 }
386
387 write_fastq_record_to(&mut writer, &r1_rec, encoding)?;
388 write_fastq_record_to(&mut writer, &r2_rec, encoding)?;
389 count += 1;
390 }
391 (None, None) => break,
392 (Some(_), None) => {
393 return Err(CyaneaError::InvalidInput(
394 "R1 file has more records than R2 file".into(),
395 ));
396 }
397 (None, Some(_)) => {
398 return Err(CyaneaError::InvalidInput(
399 "R2 file has more records than R1 file".into(),
400 ));
401 }
402 }
403 }
404
405 writer.flush()?;
406 Ok(count)
407}
408
409pub fn deinterleave_fastq_file(
414 input_path: impl AsRef<Path>,
415 r1_path: impl AsRef<Path>,
416 r2_path: impl AsRef<Path>,
417 validation: MateValidation,
418) -> Result<u64> {
419 let mut reader = needletail::parse_fastx_file(input_path.as_ref())
420 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
421 let mut r1_writer = BufWriter::new(File::create(r1_path.as_ref())?);
422 let mut r2_writer = BufWriter::new(File::create(r2_path.as_ref())?);
423
424 let encoding = PhredEncoding::Phred33;
425 let mut count = 0u64;
426
427 loop {
428 let r1 = match reader.next() {
429 Some(r1) => r1.map_err(|e| CyaneaError::Parse(e.to_string()))?,
430 None => break,
431 };
432 let r1_rec = record_from_parts(r1.id(), &r1.seq(), r1.qual())?;
433
434 let r2 = reader
435 .next()
436 .ok_or_else(|| {
437 CyaneaError::InvalidInput(
438 "odd number of records in interleaved FASTQ".into(),
439 )
440 })?
441 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
442 let r2_rec = record_from_parts(r2.id(), &r2.seq(), r2.qual())?;
443
444 match validation {
445 MateValidation::Strict => validate_mate_pair_strict(&r1_rec, &r2_rec)?,
446 MateValidation::Relaxed => validate_mate_pair(&r1_rec, &r2_rec)?,
447 MateValidation::None => {}
448 }
449
450 write_fastq_record_to(&mut r1_writer, &r1_rec, encoding)?;
451 write_fastq_record_to(&mut r2_writer, &r2_rec, encoding)?;
452 count += 1;
453 }
454
455 r1_writer.flush()?;
456 r2_writer.flush()?;
457 Ok(count)
458}
459
460#[cfg(test)]
465mod tests {
466 use super::*;
467 use std::io::Write;
468 use tempfile::NamedTempFile;
469
470 fn make_record(name: &str, seq: &[u8], quals: &[u8]) -> FastqRecord {
471 let sequence = DnaSequence::new(seq).unwrap();
472 let quality = QualityScores::from_raw(quals.to_vec());
473 FastqRecord::new(name.to_string(), None, sequence, quality).unwrap()
474 }
475
476 fn make_record_with_desc(
477 name: &str,
478 desc: &str,
479 seq: &[u8],
480 quals: &[u8],
481 ) -> FastqRecord {
482 let sequence = DnaSequence::new(seq).unwrap();
483 let quality = QualityScores::from_raw(quals.to_vec());
484 FastqRecord::new(name.to_string(), Some(desc.to_string()), sequence, quality).unwrap()
485 }
486
487 fn write_temp_fastq(records: &[(&str, &str, &str)]) -> NamedTempFile {
488 let mut file = NamedTempFile::new().unwrap();
489 for (name, seq, qual) in records {
490 writeln!(file, "@{}", name).unwrap();
491 writeln!(file, "{}", seq).unwrap();
492 writeln!(file, "+").unwrap();
493 writeln!(file, "{}", qual).unwrap();
494 }
495 file.flush().unwrap();
496 file
497 }
498
499 #[test]
502 fn strip_suffix_slash_1() {
503 assert_eq!(strip_read_suffix("read1/1"), "read1");
504 }
505
506 #[test]
507 fn strip_suffix_slash_2() {
508 assert_eq!(strip_read_suffix("read1/2"), "read1");
509 }
510
511 #[test]
512 fn strip_suffix_underscore_1() {
513 assert_eq!(strip_read_suffix("read1_1"), "read1");
514 }
515
516 #[test]
517 fn strip_suffix_underscore_2() {
518 assert_eq!(strip_read_suffix("read1_2"), "read1");
519 }
520
521 #[test]
522 fn strip_suffix_none() {
523 assert_eq!(strip_read_suffix("read1"), "read1");
524 }
525
526 #[test]
527 fn strip_suffix_illumina_name() {
528 assert_eq!(
530 strip_read_suffix("INSTRUMENT:1:2:3:4:5:6"),
531 "INSTRUMENT:1:2:3:4:5:6"
532 );
533 }
534
535 #[test]
536 fn strip_suffix_short_name() {
537 assert_eq!(strip_read_suffix("r"), "r");
538 }
539
540 #[test]
543 fn validate_relaxed_matching_slash() {
544 let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
545 let r2 = make_record("read1/2", b"ACGT", &[30; 4]);
546 assert!(validate_mate_pair(&r1, &r2).is_ok());
547 }
548
549 #[test]
550 fn validate_relaxed_matching_underscore() {
551 let r1 = make_record("read1_1", b"ACGT", &[30; 4]);
552 let r2 = make_record("read1_2", b"ACGT", &[30; 4]);
553 assert!(validate_mate_pair(&r1, &r2).is_ok());
554 }
555
556 #[test]
557 fn validate_relaxed_identical_names() {
558 let r1 = make_record("read1", b"ACGT", &[30; 4]);
559 let r2 = make_record("read1", b"ACGT", &[30; 4]);
560 assert!(validate_mate_pair(&r1, &r2).is_ok());
561 }
562
563 #[test]
564 fn validate_relaxed_mismatch() {
565 let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
566 let r2 = make_record("read2/2", b"ACGT", &[30; 4]);
567 assert!(validate_mate_pair(&r1, &r2).is_err());
568 }
569
570 #[test]
571 fn validate_strict_valid() {
572 let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
573 let r2 = make_record("read1/2", b"ACGT", &[30; 4]);
574 assert!(validate_mate_pair_strict(&r1, &r2).is_ok());
575 }
576
577 #[test]
578 fn validate_strict_missing_suffix_r1() {
579 let r1 = make_record("read1", b"ACGT", &[30; 4]);
580 let r2 = make_record("read1/2", b"ACGT", &[30; 4]);
581 assert!(validate_mate_pair_strict(&r1, &r2).is_err());
582 }
583
584 #[test]
585 fn validate_strict_missing_suffix_r2() {
586 let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
587 let r2 = make_record("read1", b"ACGT", &[30; 4]);
588 assert!(validate_mate_pair_strict(&r1, &r2).is_err());
589 }
590
591 #[test]
592 fn validate_strict_wrong_suffix_order() {
593 let r1 = make_record("read1/2", b"ACGT", &[30; 4]);
594 let r2 = make_record("read1/1", b"ACGT", &[30; 4]);
595 assert!(validate_mate_pair_strict(&r1, &r2).is_err());
596 }
597
598 #[test]
601 fn pair_creation_valid() {
602 let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
603 let r2 = make_record("read1/2", b"TGCA", &[25; 4]);
604 let pair = PairedFastqRecord::new(r1, r2, MateValidation::Relaxed).unwrap();
605 assert_eq!(pair.r1().sequence().as_bytes(), b"ACGT");
606 assert_eq!(pair.r2().sequence().as_bytes(), b"TGCA");
607 assert_eq!(pair.pair_name(), "read1");
608 }
609
610 #[test]
611 fn pair_creation_invalid() {
612 let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
613 let r2 = make_record("read2/2", b"TGCA", &[25; 4]);
614 assert!(PairedFastqRecord::new(r1, r2, MateValidation::Relaxed).is_err());
615 }
616
617 #[test]
618 fn pair_creation_unchecked() {
619 let r1 = make_record("read1", b"ACGT", &[30; 4]);
620 let r2 = make_record("read2", b"TGCA", &[25; 4]);
621 let pair = PairedFastqRecord::new_unchecked(r1, r2);
622 assert_eq!(pair.r1().name(), "read1");
623 assert_eq!(pair.r2().name(), "read2");
624 }
625
626 #[test]
627 fn pair_into_reads() {
628 let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
629 let r2 = make_record("read1/2", b"TGCA", &[25; 4]);
630 let pair = PairedFastqRecord::new_unchecked(r1, r2);
631 let (r1, r2) = pair.into_reads();
632 assert_eq!(r1.sequence().as_bytes(), b"ACGT");
633 assert_eq!(r2.sequence().as_bytes(), b"TGCA");
634 }
635
636 #[test]
637 fn pair_no_validation() {
638 let r1 = make_record("foo", b"ACGT", &[30; 4]);
639 let r2 = make_record("bar", b"TGCA", &[25; 4]);
640 assert!(PairedFastqRecord::new(r1, r2, MateValidation::None).is_ok());
641 }
642
643 #[test]
646 fn parse_paired_files_matching() {
647 let r1_file = write_temp_fastq(&[
648 ("read1/1", "ACGTACGT", "IIIIIIII"),
649 ("read2/1", "TGCATGCA", "IIIIIIII"),
650 ]);
651 let r2_file = write_temp_fastq(&[
652 ("read1/2", "GGGGCCCC", "IIIIIIII"),
653 ("read2/2", "AAAATTTT", "IIIIIIII"),
654 ]);
655 let pairs = parse_paired_fastq_files(
656 r1_file.path(),
657 r2_file.path(),
658 MateValidation::Relaxed,
659 )
660 .unwrap();
661 assert_eq!(pairs.len(), 2);
662 assert_eq!(pairs[0].r1().sequence().as_bytes(), b"ACGTACGT");
663 assert_eq!(pairs[0].r2().sequence().as_bytes(), b"GGGGCCCC");
664 assert_eq!(pairs[1].pair_name(), "read2");
665 }
666
667 #[test]
668 fn parse_paired_files_unequal_r1_longer() {
669 let r1_file = write_temp_fastq(&[
670 ("read1/1", "ACGTACGT", "IIIIIIII"),
671 ("read2/1", "TGCATGCA", "IIIIIIII"),
672 ]);
673 let r2_file = write_temp_fastq(&[("read1/2", "GGGGCCCC", "IIIIIIII")]);
674 let result = parse_paired_fastq_files(
675 r1_file.path(),
676 r2_file.path(),
677 MateValidation::Relaxed,
678 );
679 assert!(result.is_err());
680 }
681
682 #[test]
683 fn parse_paired_files_unequal_r2_longer() {
684 let r1_file = write_temp_fastq(&[("read1/1", "ACGTACGT", "IIIIIIII")]);
685 let r2_file = write_temp_fastq(&[
686 ("read1/2", "GGGGCCCC", "IIIIIIII"),
687 ("read2/2", "AAAATTTT", "IIIIIIII"),
688 ]);
689 let result = parse_paired_fastq_files(
690 r1_file.path(),
691 r2_file.path(),
692 MateValidation::Relaxed,
693 );
694 assert!(result.is_err());
695 }
696
697 #[test]
698 fn parse_paired_files_no_validation() {
699 let r1_file = write_temp_fastq(&[("foo", "ACGTACGT", "IIIIIIII")]);
700 let r2_file = write_temp_fastq(&[("bar", "GGGGCCCC", "IIIIIIII")]);
701 let pairs = parse_paired_fastq_files(
702 r1_file.path(),
703 r2_file.path(),
704 MateValidation::None,
705 )
706 .unwrap();
707 assert_eq!(pairs.len(), 1);
708 }
709
710 #[test]
711 fn parse_paired_files_validation_failure() {
712 let r1_file = write_temp_fastq(&[("read1/1", "ACGTACGT", "IIIIIIII")]);
713 let r2_file = write_temp_fastq(&[("read2/2", "GGGGCCCC", "IIIIIIII")]);
714 let result = parse_paired_fastq_files(
715 r1_file.path(),
716 r2_file.path(),
717 MateValidation::Relaxed,
718 );
719 assert!(result.is_err());
720 }
721
722 #[test]
723 fn parse_paired_files_strict_validation() {
724 let r1_file = write_temp_fastq(&[("read1/1", "ACGT", "IIII")]);
725 let r2_file = write_temp_fastq(&[("read1/2", "TGCA", "IIII")]);
726 let pairs = parse_paired_fastq_files(
727 r1_file.path(),
728 r2_file.path(),
729 MateValidation::Strict,
730 )
731 .unwrap();
732 assert_eq!(pairs.len(), 1);
733 }
734
735 #[test]
738 fn parse_interleaved_valid() {
739 let file = write_temp_fastq(&[
740 ("read1/1", "ACGTACGT", "IIIIIIII"),
741 ("read1/2", "TGCATGCA", "IIIIIIII"),
742 ("read2/1", "GGGGCCCC", "IIIIIIII"),
743 ("read2/2", "AAAATTTT", "IIIIIIII"),
744 ]);
745 let pairs = parse_interleaved_fastq(file.path(), MateValidation::Relaxed).unwrap();
746 assert_eq!(pairs.len(), 2);
747 assert_eq!(pairs[0].r1().sequence().as_bytes(), b"ACGTACGT");
748 assert_eq!(pairs[0].r2().sequence().as_bytes(), b"TGCATGCA");
749 }
750
751 #[test]
752 fn parse_interleaved_odd_count() {
753 let file = write_temp_fastq(&[
754 ("read1/1", "ACGTACGT", "IIIIIIII"),
755 ("read1/2", "TGCATGCA", "IIIIIIII"),
756 ("read2/1", "GGGGCCCC", "IIIIIIII"),
757 ]);
758 let result = parse_interleaved_fastq(file.path(), MateValidation::None);
759 assert!(result.is_err());
760 }
761
762 #[test]
765 fn write_parse_roundtrip_separate() {
766 let r1 = make_record("read1/1", b"ACGTACGT", &[30; 8]);
767 let r2 = make_record("read1/2", b"TGCATGCA", &[25; 8]);
768 let pairs = vec![PairedFastqRecord::new_unchecked(r1, r2)];
769
770 let r1_file = NamedTempFile::new().unwrap();
771 let r2_file = NamedTempFile::new().unwrap();
772 write_paired_fastq(&pairs, r1_file.path(), r2_file.path(), PhredEncoding::Phred33)
773 .unwrap();
774
775 let parsed = parse_paired_fastq_files(
776 r1_file.path(),
777 r2_file.path(),
778 MateValidation::None,
779 )
780 .unwrap();
781 assert_eq!(parsed.len(), 1);
782 assert_eq!(parsed[0].r1().sequence().as_bytes(), b"ACGTACGT");
783 assert_eq!(parsed[0].r2().sequence().as_bytes(), b"TGCATGCA");
784 assert_eq!(parsed[0].r1().quality().as_slice(), &[30; 8]);
785 assert_eq!(parsed[0].r2().quality().as_slice(), &[25; 8]);
786 }
787
788 #[test]
789 fn write_parse_roundtrip_interleaved() {
790 let r1 = make_record("read1/1", b"ACGTACGT", &[30; 8]);
791 let r2 = make_record("read1/2", b"TGCATGCA", &[25; 8]);
792 let pairs = vec![PairedFastqRecord::new_unchecked(r1, r2)];
793
794 let file = NamedTempFile::new().unwrap();
795 write_interleaved_fastq(&pairs, file.path(), PhredEncoding::Phred33).unwrap();
796
797 let parsed = parse_interleaved_fastq(file.path(), MateValidation::None).unwrap();
798 assert_eq!(parsed.len(), 1);
799 assert_eq!(parsed[0].r1().sequence().as_bytes(), b"ACGTACGT");
800 assert_eq!(parsed[0].r2().sequence().as_bytes(), b"TGCATGCA");
801 }
802
803 #[test]
804 fn write_parse_roundtrip_with_description() {
805 let r1 = make_record_with_desc("read1", "1:N:0:ATCACG", b"ACGTACGT", &[30; 8]);
806 let r2 = make_record_with_desc("read1", "2:N:0:ATCACG", b"TGCATGCA", &[25; 8]);
807 let pairs = vec![PairedFastqRecord::new_unchecked(r1, r2)];
808
809 let file = NamedTempFile::new().unwrap();
810 write_interleaved_fastq(&pairs, file.path(), PhredEncoding::Phred33).unwrap();
811
812 let parsed = parse_interleaved_fastq(file.path(), MateValidation::None).unwrap();
813 assert_eq!(parsed.len(), 1);
814 assert_eq!(parsed[0].r1().name(), "read1");
815 assert_eq!(parsed[0].r1().description(), Some("1:N:0:ATCACG"));
816 assert_eq!(parsed[0].r2().description(), Some("2:N:0:ATCACG"));
817 }
818
819 #[test]
820 fn write_parse_roundtrip_multiple_pairs() {
821 let pairs = vec![
822 PairedFastqRecord::new_unchecked(
823 make_record("r1/1", b"AAAA", &[30; 4]),
824 make_record("r1/2", b"CCCC", &[25; 4]),
825 ),
826 PairedFastqRecord::new_unchecked(
827 make_record("r2/1", b"GGGG", &[35; 4]),
828 make_record("r2/2", b"TTTT", &[20; 4]),
829 ),
830 ];
831
832 let r1_file = NamedTempFile::new().unwrap();
833 let r2_file = NamedTempFile::new().unwrap();
834 write_paired_fastq(&pairs, r1_file.path(), r2_file.path(), PhredEncoding::Phred33)
835 .unwrap();
836
837 let parsed = parse_paired_fastq_files(
838 r1_file.path(),
839 r2_file.path(),
840 MateValidation::None,
841 )
842 .unwrap();
843 assert_eq!(parsed.len(), 2);
844 assert_eq!(parsed[0].r1().sequence().as_bytes(), b"AAAA");
845 assert_eq!(parsed[1].r2().sequence().as_bytes(), b"TTTT");
846 }
847
848 #[test]
851 fn interleave_deinterleave_roundtrip() {
852 let r1_file = write_temp_fastq(&[
853 ("read1/1", "ACGTACGT", "IIIIIIII"),
854 ("read2/1", "TGCATGCA", "IIIIIIII"),
855 ]);
856 let r2_file = write_temp_fastq(&[
857 ("read1/2", "GGGGCCCC", "IIIIIIII"),
858 ("read2/2", "AAAATTTT", "IIIIIIII"),
859 ]);
860
861 let interleaved = NamedTempFile::new().unwrap();
862 let count = interleave_fastq_files(
863 r1_file.path(),
864 r2_file.path(),
865 interleaved.path(),
866 MateValidation::Relaxed,
867 )
868 .unwrap();
869 assert_eq!(count, 2);
870
871 let out_r1 = NamedTempFile::new().unwrap();
872 let out_r2 = NamedTempFile::new().unwrap();
873 let count = deinterleave_fastq_file(
874 interleaved.path(),
875 out_r1.path(),
876 out_r2.path(),
877 MateValidation::Relaxed,
878 )
879 .unwrap();
880 assert_eq!(count, 2);
881
882 let pairs = parse_paired_fastq_files(
883 out_r1.path(),
884 out_r2.path(),
885 MateValidation::None,
886 )
887 .unwrap();
888 assert_eq!(pairs.len(), 2);
889 assert_eq!(pairs[0].r1().sequence().as_bytes(), b"ACGTACGT");
890 assert_eq!(pairs[0].r2().sequence().as_bytes(), b"GGGGCCCC");
891 }
892
893 #[test]
894 fn interleave_unequal_files() {
895 let r1_file = write_temp_fastq(&[
896 ("read1/1", "ACGT", "IIII"),
897 ("read2/1", "TGCA", "IIII"),
898 ]);
899 let r2_file = write_temp_fastq(&[("read1/2", "GGGG", "IIII")]);
900 let output = NamedTempFile::new().unwrap();
901 let result = interleave_fastq_files(
902 r1_file.path(),
903 r2_file.path(),
904 output.path(),
905 MateValidation::None,
906 );
907 assert!(result.is_err());
908 }
909
910 #[test]
913 fn parse_paired_empty_files() {
914 let r1_file = NamedTempFile::new().unwrap();
915 let r2_file = NamedTempFile::new().unwrap();
916 let pairs = parse_paired_fastq_files(
917 r1_file.path(),
918 r2_file.path(),
919 MateValidation::None,
920 )
921 .unwrap();
922 assert!(pairs.is_empty());
923 }
924
925 #[test]
926 fn parse_interleaved_empty_file() {
927 let file = NamedTempFile::new().unwrap();
928 let pairs = parse_interleaved_fastq(file.path(), MateValidation::None).unwrap();
929 assert!(pairs.is_empty());
930 }
931
932 #[test]
933 fn parse_paired_single_pair() {
934 let r1_file = write_temp_fastq(&[("read1/1", "ACGT", "IIII")]);
935 let r2_file = write_temp_fastq(&[("read1/2", "TGCA", "IIII")]);
936 let pairs = parse_paired_fastq_files(
937 r1_file.path(),
938 r2_file.path(),
939 MateValidation::Strict,
940 )
941 .unwrap();
942 assert_eq!(pairs.len(), 1);
943 }
944
945 #[test]
946 fn write_empty_pairs() {
947 let pairs: Vec<PairedFastqRecord> = vec![];
948 let r1_file = NamedTempFile::new().unwrap();
949 let r2_file = NamedTempFile::new().unwrap();
950 write_paired_fastq(&pairs, r1_file.path(), r2_file.path(), PhredEncoding::Phred33)
951 .unwrap();
952 let parsed = parse_paired_fastq_files(
953 r1_file.path(),
954 r2_file.path(),
955 MateValidation::None,
956 )
957 .unwrap();
958 assert!(parsed.is_empty());
959 }
960
961 #[test]
964 fn paired_stats() {
965 let r1_file = write_temp_fastq(&[
966 ("read1", "ACGTACGT", "IIIIIIII"),
967 ("read2", "TGCA", "IIII"),
968 ]);
969 let r2_file = write_temp_fastq(&[
970 ("read1", "GGGGCCCC", "IIIIIIII"),
971 ("read2", "AAAA", "IIII"),
972 ]);
973 let stats = parse_paired_fastq_stats(r1_file.path(), r2_file.path()).unwrap();
974 assert_eq!(stats.pair_count, 2);
975 assert_eq!(stats.r1_stats.sequence_count, 2);
976 assert_eq!(stats.r2_stats.sequence_count, 2);
977 }
978}
979
980#[cfg(test)]
981mod proptests {
982 use super::*;
983 use crate::types::DnaSequence;
984 use proptest::prelude::*;
985 use tempfile::NamedTempFile;
986
987 fn dna_and_quality(max_len: usize) -> impl Strategy<Value = (Vec<u8>, Vec<u8>)> {
988 (1..=max_len).prop_flat_map(|len| {
989 let seq = proptest::collection::vec(
990 prop_oneof![Just(b'A'), Just(b'C'), Just(b'G'), Just(b'T')],
991 len,
992 );
993 let qual = proptest::collection::vec(0..=41u8, len);
994 (seq, qual)
995 })
996 }
997
998 proptest! {
999 #[test]
1000 fn roundtrip_write_parse(
1001 (seq1, qual1) in dna_and_quality(100),
1002 (seq2, qual2) in dna_and_quality(100),
1003 ) {
1004 let r1 = {
1005 let s = DnaSequence::new(&seq1).unwrap();
1006 let q = QualityScores::from_raw(qual1.clone());
1007 FastqRecord::new("read/1".into(), None, s, q).unwrap()
1008 };
1009 let r2 = {
1010 let s = DnaSequence::new(&seq2).unwrap();
1011 let q = QualityScores::from_raw(qual2.clone());
1012 FastqRecord::new("read/2".into(), None, s, q).unwrap()
1013 };
1014 let pairs = vec![PairedFastqRecord::new_unchecked(r1, r2)];
1015
1016 let file = NamedTempFile::new().unwrap();
1017 write_interleaved_fastq(&pairs, file.path(), PhredEncoding::Phred33).unwrap();
1018
1019 let parsed = parse_interleaved_fastq(file.path(), MateValidation::None).unwrap();
1020 prop_assert_eq!(parsed.len(), 1);
1021 prop_assert_eq!(parsed[0].r1().sequence().as_bytes(), seq1.as_slice());
1022 prop_assert_eq!(parsed[0].r2().sequence().as_bytes(), seq2.as_slice());
1023 prop_assert_eq!(parsed[0].r1().quality().as_slice(), qual1.as_slice());
1024 prop_assert_eq!(parsed[0].r2().quality().as_slice(), qual2.as_slice());
1025 }
1026
1027 #[test]
1028 fn interleave_deinterleave_identity(
1029 (seq1, qual1) in dna_and_quality(50),
1030 (seq2, qual2) in dna_and_quality(50),
1031 ) {
1032 use std::io::Write as _;
1033
1034 let r1_file = NamedTempFile::new().unwrap();
1035 let r2_file = NamedTempFile::new().unwrap();
1036
1037 {
1039 let ascii1: Vec<u8> = qual1.iter().map(|&q| q + 33).collect();
1040 let seq_str = std::str::from_utf8(&seq1).unwrap();
1041 let qual_str = std::str::from_utf8(&ascii1).unwrap();
1042 write!(r1_file.as_file(), "@read/1\n{}\n+\n{}\n", seq_str, qual_str).unwrap();
1043 }
1044 {
1046 let ascii2: Vec<u8> = qual2.iter().map(|&q| q + 33).collect();
1047 let seq_str = std::str::from_utf8(&seq2).unwrap();
1048 let qual_str = std::str::from_utf8(&ascii2).unwrap();
1049 write!(r2_file.as_file(), "@read/2\n{}\n+\n{}\n", seq_str, qual_str).unwrap();
1050 }
1051
1052 let interleaved = NamedTempFile::new().unwrap();
1053 let count = interleave_fastq_files(
1054 r1_file.path(),
1055 r2_file.path(),
1056 interleaved.path(),
1057 MateValidation::None,
1058 )
1059 .unwrap();
1060 prop_assert_eq!(count, 1);
1061
1062 let out_r1 = NamedTempFile::new().unwrap();
1063 let out_r2 = NamedTempFile::new().unwrap();
1064 let count2 = deinterleave_fastq_file(
1065 interleaved.path(),
1066 out_r1.path(),
1067 out_r2.path(),
1068 MateValidation::None,
1069 )
1070 .unwrap();
1071 prop_assert_eq!(count2, 1);
1072
1073 let pairs = parse_paired_fastq_files(
1074 out_r1.path(),
1075 out_r2.path(),
1076 MateValidation::None,
1077 )
1078 .unwrap();
1079 prop_assert_eq!(pairs.len(), 1);
1080 prop_assert_eq!(pairs[0].r1().sequence().as_bytes(), seq1.as_slice());
1081 prop_assert_eq!(pairs[0].r2().sequence().as_bytes(), seq2.as_slice());
1082 }
1083 }
1084}