Skip to main content

cyanea_seq/
paired.rs

1//! Paired-end FASTQ support.
2//!
3//! Types and functions for working with paired-end Illumina reads (R1/R2).
4//! Supports separate files, interleaved files, and conversions between them.
5//!
6//! # Example
7//!
8//! ```no_run
9//! use cyanea_seq::paired::{parse_paired_fastq_files, MateValidation};
10//!
11//! let pairs = parse_paired_fastq_files("reads_R1.fq", "reads_R2.fq", MateValidation::Relaxed)
12//!     .unwrap();
13//! for pair in &pairs {
14//!     println!("{}: R1={} bp, R2={} bp",
15//!         pair.pair_name(),
16//!         pair.r1().sequence().len(),
17//!         pair.r2().sequence().len());
18//! }
19//! ```
20
21use std::fs::File;
22use std::io::{BufWriter, Write};
23use std::path::Path;
24
25use cyanea_core::{Annotated, CyaneaError, Result, Sequence};
26
27use crate::fastq::{parse_fastq_stats, FastqRecord, FastqStats};
28use crate::quality::{PhredEncoding, QualityScores};
29use crate::types::DnaSequence;
30
31// ---------------------------------------------------------------------------
32// Core types
33// ---------------------------------------------------------------------------
34
35/// How to validate that two reads form a proper mate pair.
36#[derive(Debug, Clone, Copy, PartialEq, Eq)]
37pub enum MateValidation {
38    /// Names must match after stripping suffixes, and reads must have `/1` and `/2` suffixes.
39    Strict,
40    /// Names must match after stripping any read-number suffix (`/1`, `/2`, `_1`, `_2`).
41    Relaxed,
42    /// No validation — trust file order.
43    None,
44}
45
46/// A paired-end FASTQ record containing R1 (forward) and R2 (reverse) reads.
47#[derive(Debug, Clone)]
48pub struct PairedFastqRecord {
49    r1: FastqRecord,
50    r2: FastqRecord,
51}
52
53impl PairedFastqRecord {
54    /// Create a new paired record with mate validation.
55    pub fn new(r1: FastqRecord, r2: FastqRecord, validation: MateValidation) -> Result<Self> {
56        match validation {
57            MateValidation::Strict => validate_mate_pair_strict(&r1, &r2)?,
58            MateValidation::Relaxed => validate_mate_pair(&r1, &r2)?,
59            MateValidation::None => {}
60        }
61        Ok(Self { r1, r2 })
62    }
63
64    /// Create a new paired record without any validation.
65    pub fn new_unchecked(r1: FastqRecord, r2: FastqRecord) -> Self {
66        Self { r1, r2 }
67    }
68
69    /// The forward (R1) read.
70    pub fn r1(&self) -> &FastqRecord {
71        &self.r1
72    }
73
74    /// The reverse (R2) read.
75    pub fn r2(&self) -> &FastqRecord {
76        &self.r2
77    }
78
79    /// Consume the pair and return both reads.
80    pub fn into_reads(self) -> (FastqRecord, FastqRecord) {
81        (self.r1, self.r2)
82    }
83
84    /// The shared pair name (R1 name with read-number suffix stripped).
85    pub fn pair_name(&self) -> &str {
86        strip_read_suffix(self.r1.name())
87    }
88}
89
90/// Summary statistics for a paired-end FASTQ dataset.
91#[derive(Debug, Clone)]
92pub struct PairedFastqStats {
93    /// Number of read pairs.
94    pub pair_count: u64,
95    /// Statistics for R1 reads.
96    pub r1_stats: FastqStats,
97    /// Statistics for R2 reads.
98    pub r2_stats: FastqStats,
99}
100
101// ---------------------------------------------------------------------------
102// Helpers
103// ---------------------------------------------------------------------------
104
105/// Strip read-number suffixes from a read name.
106///
107/// Recognizes `/1`, `/2`, `_1`, `_2` at the end of the name.
108/// Illumina-style read numbers (e.g. `1:N:0:ATCACG`) are stored in the
109/// description field by the parser, so the name is already clean.
110pub fn strip_read_suffix(name: &str) -> &str {
111    if name.len() >= 2 {
112        let suffix = &name[name.len() - 2..];
113        if suffix == "/1" || suffix == "/2" || suffix == "_1" || suffix == "_2" {
114            return &name[..name.len() - 2];
115        }
116    }
117    name
118}
119
120/// Validate that two reads form a mate pair (relaxed: name prefixes must match).
121pub fn validate_mate_pair(r1: &FastqRecord, r2: &FastqRecord) -> Result<()> {
122    let name1 = strip_read_suffix(r1.name());
123    let name2 = strip_read_suffix(r2.name());
124    if name1 != name2 {
125        return Err(CyaneaError::InvalidInput(format!(
126            "mate pair name mismatch: '{}' vs '{}'",
127            r1.name(),
128            r2.name()
129        )));
130    }
131    Ok(())
132}
133
134/// Validate that two reads form a mate pair (strict: requires `/1` and `/2` suffixes).
135pub fn validate_mate_pair_strict(r1: &FastqRecord, r2: &FastqRecord) -> Result<()> {
136    if !r1.name().ends_with("/1") {
137        return Err(CyaneaError::InvalidInput(format!(
138            "R1 read name '{}' does not end with /1",
139            r1.name()
140        )));
141    }
142    if !r2.name().ends_with("/2") {
143        return Err(CyaneaError::InvalidInput(format!(
144            "R2 read name '{}' does not end with /2",
145            r2.name()
146        )));
147    }
148    validate_mate_pair(r1, r2)
149}
150
151/// Convert raw needletail record fields into a [`FastqRecord`].
152fn record_from_parts(id: &[u8], seq: &[u8], qual: Option<&[u8]>) -> Result<FastqRecord> {
153    let raw_id =
154        std::str::from_utf8(id).map_err(|e| CyaneaError::Parse(e.to_string()))?;
155    let (name, description) = match raw_id.split_once(char::is_whitespace) {
156        Some((n, d)) => (n.to_string(), Some(d.to_string())),
157        None => (raw_id.to_string(), None),
158    };
159    let sequence = DnaSequence::new(seq)?;
160    let qual_bytes =
161        qual.ok_or_else(|| CyaneaError::Parse("missing quality scores".into()))?;
162    let quality = QualityScores::from_ascii(qual_bytes, PhredEncoding::Phred33)?;
163    FastqRecord::new(name, description, sequence, quality)
164}
165
166// ---------------------------------------------------------------------------
167// Parsing
168// ---------------------------------------------------------------------------
169
170/// Parse paired FASTQ files (separate R1 and R2 files) into paired records.
171///
172/// Opens both files with needletail and iterates in lockstep. Returns an error
173/// if one file has more records than the other or if validation fails.
174pub fn parse_paired_fastq_files(
175    r1_path: impl AsRef<Path>,
176    r2_path: impl AsRef<Path>,
177    validation: MateValidation,
178) -> Result<Vec<PairedFastqRecord>> {
179    let r1_empty = std::fs::metadata(r1_path.as_ref())
180        .map(|m| m.len() == 0)
181        .unwrap_or(false);
182    let r2_empty = std::fs::metadata(r2_path.as_ref())
183        .map(|m| m.len() == 0)
184        .unwrap_or(false);
185    if r1_empty && r2_empty {
186        return Ok(Vec::new());
187    }
188    if r1_empty != r2_empty {
189        return Err(CyaneaError::InvalidInput(
190            "one file is empty but the other is not".into(),
191        ));
192    }
193
194    let mut r1_reader = needletail::parse_fastx_file(r1_path.as_ref())
195        .map_err(|e| CyaneaError::Parse(e.to_string()))?;
196    let mut r2_reader = needletail::parse_fastx_file(r2_path.as_ref())
197        .map_err(|e| CyaneaError::Parse(e.to_string()))?;
198
199    let mut pairs = Vec::new();
200    loop {
201        let r1_next = r1_reader.next();
202        let r2_next = r2_reader.next();
203
204        match (r1_next, r2_next) {
205            (Some(r1), Some(r2)) => {
206                let r1 = r1.map_err(|e| CyaneaError::Parse(e.to_string()))?;
207                let r2 = r2.map_err(|e| CyaneaError::Parse(e.to_string()))?;
208                let r1_rec = record_from_parts(r1.id(), &r1.seq(), r1.qual())?;
209                let r2_rec = record_from_parts(r2.id(), &r2.seq(), r2.qual())?;
210                pairs.push(PairedFastqRecord::new(r1_rec, r2_rec, validation)?);
211            }
212            (None, None) => break,
213            (Some(_), None) => {
214                return Err(CyaneaError::InvalidInput(
215                    "R1 file has more records than R2 file".into(),
216                ));
217            }
218            (None, Some(_)) => {
219                return Err(CyaneaError::InvalidInput(
220                    "R2 file has more records than R1 file".into(),
221                ));
222            }
223        }
224    }
225
226    Ok(pairs)
227}
228
229/// Parse an interleaved FASTQ file (alternating R1/R2 records) into paired records.
230///
231/// Reads two records at a time from a single file. Returns an error on odd
232/// record count or if validation fails.
233pub fn parse_interleaved_fastq(
234    path: impl AsRef<Path>,
235    validation: MateValidation,
236) -> Result<Vec<PairedFastqRecord>> {
237    if std::fs::metadata(path.as_ref())
238        .map(|m| m.len() == 0)
239        .unwrap_or(false)
240    {
241        return Ok(Vec::new());
242    }
243
244    let mut reader = needletail::parse_fastx_file(path.as_ref())
245        .map_err(|e| CyaneaError::Parse(e.to_string()))?;
246
247    let mut pairs = Vec::new();
248    loop {
249        let r1 = match reader.next() {
250            Some(r1) => r1.map_err(|e| CyaneaError::Parse(e.to_string()))?,
251            None => break,
252        };
253        let r1_rec = record_from_parts(r1.id(), &r1.seq(), r1.qual())?;
254
255        let r2 = reader
256            .next()
257            .ok_or_else(|| {
258                CyaneaError::InvalidInput(
259                    "odd number of records in interleaved FASTQ".into(),
260                )
261            })?
262            .map_err(|e| CyaneaError::Parse(e.to_string()))?;
263        let r2_rec = record_from_parts(r2.id(), &r2.seq(), r2.qual())?;
264
265        pairs.push(PairedFastqRecord::new(r1_rec, r2_rec, validation)?);
266    }
267
268    Ok(pairs)
269}
270
271/// Compute summary statistics for paired FASTQ files without storing records.
272pub fn parse_paired_fastq_stats(
273    r1_path: impl AsRef<Path>,
274    r2_path: impl AsRef<Path>,
275) -> Result<PairedFastqStats> {
276    let r1_stats = parse_fastq_stats(r1_path)?;
277    let r2_stats = parse_fastq_stats(r2_path)?;
278    let pair_count = r1_stats.sequence_count.min(r2_stats.sequence_count);
279    Ok(PairedFastqStats {
280        pair_count,
281        r1_stats,
282        r2_stats,
283    })
284}
285
286// ---------------------------------------------------------------------------
287// Writing
288// ---------------------------------------------------------------------------
289
290/// Write a single FASTQ record to a writer.
291fn write_fastq_record_to(
292    writer: &mut impl Write,
293    record: &FastqRecord,
294    encoding: PhredEncoding,
295) -> Result<()> {
296    writer.write_all(b"@")?;
297    writer.write_all(record.name().as_bytes())?;
298    if let Some(desc) = record.description() {
299        writer.write_all(b" ")?;
300        writer.write_all(desc.as_bytes())?;
301    }
302    writer.write_all(b"\n")?;
303    writer.write_all(record.sequence().as_bytes())?;
304    writer.write_all(b"\n+\n")?;
305    writer.write_all(&record.quality().to_ascii(encoding))?;
306    writer.write_all(b"\n")?;
307    Ok(())
308}
309
310/// Write paired records to separate R1 and R2 FASTQ files.
311pub fn write_paired_fastq(
312    pairs: &[PairedFastqRecord],
313    r1_path: impl AsRef<Path>,
314    r2_path: impl AsRef<Path>,
315    encoding: PhredEncoding,
316) -> Result<()> {
317    let mut r1_writer = BufWriter::new(File::create(r1_path.as_ref())?);
318    let mut r2_writer = BufWriter::new(File::create(r2_path.as_ref())?);
319
320    for pair in pairs {
321        write_fastq_record_to(&mut r1_writer, &pair.r1, encoding)?;
322        write_fastq_record_to(&mut r2_writer, &pair.r2, encoding)?;
323    }
324
325    r1_writer.flush()?;
326    r2_writer.flush()?;
327    Ok(())
328}
329
330/// Write paired records to a single interleaved FASTQ file.
331pub fn write_interleaved_fastq(
332    pairs: &[PairedFastqRecord],
333    path: impl AsRef<Path>,
334    encoding: PhredEncoding,
335) -> Result<()> {
336    let mut writer = BufWriter::new(File::create(path.as_ref())?);
337
338    for pair in pairs {
339        write_fastq_record_to(&mut writer, &pair.r1, encoding)?;
340        write_fastq_record_to(&mut writer, &pair.r2, encoding)?;
341    }
342
343    writer.flush()?;
344    Ok(())
345}
346
347// ---------------------------------------------------------------------------
348// Interleave / Deinterleave (streaming)
349// ---------------------------------------------------------------------------
350
351/// Interleave two separate FASTQ files into one output file.
352///
353/// Streaming — does not load all records into memory. Returns the number of
354/// pairs written.
355pub fn interleave_fastq_files(
356    r1_path: impl AsRef<Path>,
357    r2_path: impl AsRef<Path>,
358    output_path: impl AsRef<Path>,
359    validation: MateValidation,
360) -> Result<u64> {
361    let mut r1_reader = needletail::parse_fastx_file(r1_path.as_ref())
362        .map_err(|e| CyaneaError::Parse(e.to_string()))?;
363    let mut r2_reader = needletail::parse_fastx_file(r2_path.as_ref())
364        .map_err(|e| CyaneaError::Parse(e.to_string()))?;
365    let mut writer = BufWriter::new(File::create(output_path.as_ref())?);
366
367    let encoding = PhredEncoding::Phred33;
368    let mut count = 0u64;
369
370    loop {
371        let r1_next = r1_reader.next();
372        let r2_next = r2_reader.next();
373
374        match (r1_next, r2_next) {
375            (Some(r1), Some(r2)) => {
376                let r1 = r1.map_err(|e| CyaneaError::Parse(e.to_string()))?;
377                let r2 = r2.map_err(|e| CyaneaError::Parse(e.to_string()))?;
378                let r1_rec = record_from_parts(r1.id(), &r1.seq(), r1.qual())?;
379                let r2_rec = record_from_parts(r2.id(), &r2.seq(), r2.qual())?;
380
381                match validation {
382                    MateValidation::Strict => validate_mate_pair_strict(&r1_rec, &r2_rec)?,
383                    MateValidation::Relaxed => validate_mate_pair(&r1_rec, &r2_rec)?,
384                    MateValidation::None => {}
385                }
386
387                write_fastq_record_to(&mut writer, &r1_rec, encoding)?;
388                write_fastq_record_to(&mut writer, &r2_rec, encoding)?;
389                count += 1;
390            }
391            (None, None) => break,
392            (Some(_), None) => {
393                return Err(CyaneaError::InvalidInput(
394                    "R1 file has more records than R2 file".into(),
395                ));
396            }
397            (None, Some(_)) => {
398                return Err(CyaneaError::InvalidInput(
399                    "R2 file has more records than R1 file".into(),
400                ));
401            }
402        }
403    }
404
405    writer.flush()?;
406    Ok(count)
407}
408
409/// Deinterleave an interleaved FASTQ file into separate R1 and R2 files.
410///
411/// Streaming — does not load all records into memory. Returns the number of
412/// pairs written.
413pub fn deinterleave_fastq_file(
414    input_path: impl AsRef<Path>,
415    r1_path: impl AsRef<Path>,
416    r2_path: impl AsRef<Path>,
417    validation: MateValidation,
418) -> Result<u64> {
419    let mut reader = needletail::parse_fastx_file(input_path.as_ref())
420        .map_err(|e| CyaneaError::Parse(e.to_string()))?;
421    let mut r1_writer = BufWriter::new(File::create(r1_path.as_ref())?);
422    let mut r2_writer = BufWriter::new(File::create(r2_path.as_ref())?);
423
424    let encoding = PhredEncoding::Phred33;
425    let mut count = 0u64;
426
427    loop {
428        let r1 = match reader.next() {
429            Some(r1) => r1.map_err(|e| CyaneaError::Parse(e.to_string()))?,
430            None => break,
431        };
432        let r1_rec = record_from_parts(r1.id(), &r1.seq(), r1.qual())?;
433
434        let r2 = reader
435            .next()
436            .ok_or_else(|| {
437                CyaneaError::InvalidInput(
438                    "odd number of records in interleaved FASTQ".into(),
439                )
440            })?
441            .map_err(|e| CyaneaError::Parse(e.to_string()))?;
442        let r2_rec = record_from_parts(r2.id(), &r2.seq(), r2.qual())?;
443
444        match validation {
445            MateValidation::Strict => validate_mate_pair_strict(&r1_rec, &r2_rec)?,
446            MateValidation::Relaxed => validate_mate_pair(&r1_rec, &r2_rec)?,
447            MateValidation::None => {}
448        }
449
450        write_fastq_record_to(&mut r1_writer, &r1_rec, encoding)?;
451        write_fastq_record_to(&mut r2_writer, &r2_rec, encoding)?;
452        count += 1;
453    }
454
455    r1_writer.flush()?;
456    r2_writer.flush()?;
457    Ok(count)
458}
459
460// ---------------------------------------------------------------------------
461// Tests
462// ---------------------------------------------------------------------------
463
464#[cfg(test)]
465mod tests {
466    use super::*;
467    use std::io::Write;
468    use tempfile::NamedTempFile;
469
470    fn make_record(name: &str, seq: &[u8], quals: &[u8]) -> FastqRecord {
471        let sequence = DnaSequence::new(seq).unwrap();
472        let quality = QualityScores::from_raw(quals.to_vec());
473        FastqRecord::new(name.to_string(), None, sequence, quality).unwrap()
474    }
475
476    fn make_record_with_desc(
477        name: &str,
478        desc: &str,
479        seq: &[u8],
480        quals: &[u8],
481    ) -> FastqRecord {
482        let sequence = DnaSequence::new(seq).unwrap();
483        let quality = QualityScores::from_raw(quals.to_vec());
484        FastqRecord::new(name.to_string(), Some(desc.to_string()), sequence, quality).unwrap()
485    }
486
487    fn write_temp_fastq(records: &[(&str, &str, &str)]) -> NamedTempFile {
488        let mut file = NamedTempFile::new().unwrap();
489        for (name, seq, qual) in records {
490            writeln!(file, "@{}", name).unwrap();
491            writeln!(file, "{}", seq).unwrap();
492            writeln!(file, "+").unwrap();
493            writeln!(file, "{}", qual).unwrap();
494        }
495        file.flush().unwrap();
496        file
497    }
498
499    // --- strip_read_suffix ---
500
501    #[test]
502    fn strip_suffix_slash_1() {
503        assert_eq!(strip_read_suffix("read1/1"), "read1");
504    }
505
506    #[test]
507    fn strip_suffix_slash_2() {
508        assert_eq!(strip_read_suffix("read1/2"), "read1");
509    }
510
511    #[test]
512    fn strip_suffix_underscore_1() {
513        assert_eq!(strip_read_suffix("read1_1"), "read1");
514    }
515
516    #[test]
517    fn strip_suffix_underscore_2() {
518        assert_eq!(strip_read_suffix("read1_2"), "read1");
519    }
520
521    #[test]
522    fn strip_suffix_none() {
523        assert_eq!(strip_read_suffix("read1"), "read1");
524    }
525
526    #[test]
527    fn strip_suffix_illumina_name() {
528        // Illumina names don't have suffixes in the name field
529        assert_eq!(
530            strip_read_suffix("INSTRUMENT:1:2:3:4:5:6"),
531            "INSTRUMENT:1:2:3:4:5:6"
532        );
533    }
534
535    #[test]
536    fn strip_suffix_short_name() {
537        assert_eq!(strip_read_suffix("r"), "r");
538    }
539
540    // --- Mate validation ---
541
542    #[test]
543    fn validate_relaxed_matching_slash() {
544        let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
545        let r2 = make_record("read1/2", b"ACGT", &[30; 4]);
546        assert!(validate_mate_pair(&r1, &r2).is_ok());
547    }
548
549    #[test]
550    fn validate_relaxed_matching_underscore() {
551        let r1 = make_record("read1_1", b"ACGT", &[30; 4]);
552        let r2 = make_record("read1_2", b"ACGT", &[30; 4]);
553        assert!(validate_mate_pair(&r1, &r2).is_ok());
554    }
555
556    #[test]
557    fn validate_relaxed_identical_names() {
558        let r1 = make_record("read1", b"ACGT", &[30; 4]);
559        let r2 = make_record("read1", b"ACGT", &[30; 4]);
560        assert!(validate_mate_pair(&r1, &r2).is_ok());
561    }
562
563    #[test]
564    fn validate_relaxed_mismatch() {
565        let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
566        let r2 = make_record("read2/2", b"ACGT", &[30; 4]);
567        assert!(validate_mate_pair(&r1, &r2).is_err());
568    }
569
570    #[test]
571    fn validate_strict_valid() {
572        let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
573        let r2 = make_record("read1/2", b"ACGT", &[30; 4]);
574        assert!(validate_mate_pair_strict(&r1, &r2).is_ok());
575    }
576
577    #[test]
578    fn validate_strict_missing_suffix_r1() {
579        let r1 = make_record("read1", b"ACGT", &[30; 4]);
580        let r2 = make_record("read1/2", b"ACGT", &[30; 4]);
581        assert!(validate_mate_pair_strict(&r1, &r2).is_err());
582    }
583
584    #[test]
585    fn validate_strict_missing_suffix_r2() {
586        let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
587        let r2 = make_record("read1", b"ACGT", &[30; 4]);
588        assert!(validate_mate_pair_strict(&r1, &r2).is_err());
589    }
590
591    #[test]
592    fn validate_strict_wrong_suffix_order() {
593        let r1 = make_record("read1/2", b"ACGT", &[30; 4]);
594        let r2 = make_record("read1/1", b"ACGT", &[30; 4]);
595        assert!(validate_mate_pair_strict(&r1, &r2).is_err());
596    }
597
598    // --- PairedFastqRecord ---
599
600    #[test]
601    fn pair_creation_valid() {
602        let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
603        let r2 = make_record("read1/2", b"TGCA", &[25; 4]);
604        let pair = PairedFastqRecord::new(r1, r2, MateValidation::Relaxed).unwrap();
605        assert_eq!(pair.r1().sequence().as_bytes(), b"ACGT");
606        assert_eq!(pair.r2().sequence().as_bytes(), b"TGCA");
607        assert_eq!(pair.pair_name(), "read1");
608    }
609
610    #[test]
611    fn pair_creation_invalid() {
612        let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
613        let r2 = make_record("read2/2", b"TGCA", &[25; 4]);
614        assert!(PairedFastqRecord::new(r1, r2, MateValidation::Relaxed).is_err());
615    }
616
617    #[test]
618    fn pair_creation_unchecked() {
619        let r1 = make_record("read1", b"ACGT", &[30; 4]);
620        let r2 = make_record("read2", b"TGCA", &[25; 4]);
621        let pair = PairedFastqRecord::new_unchecked(r1, r2);
622        assert_eq!(pair.r1().name(), "read1");
623        assert_eq!(pair.r2().name(), "read2");
624    }
625
626    #[test]
627    fn pair_into_reads() {
628        let r1 = make_record("read1/1", b"ACGT", &[30; 4]);
629        let r2 = make_record("read1/2", b"TGCA", &[25; 4]);
630        let pair = PairedFastqRecord::new_unchecked(r1, r2);
631        let (r1, r2) = pair.into_reads();
632        assert_eq!(r1.sequence().as_bytes(), b"ACGT");
633        assert_eq!(r2.sequence().as_bytes(), b"TGCA");
634    }
635
636    #[test]
637    fn pair_no_validation() {
638        let r1 = make_record("foo", b"ACGT", &[30; 4]);
639        let r2 = make_record("bar", b"TGCA", &[25; 4]);
640        assert!(PairedFastqRecord::new(r1, r2, MateValidation::None).is_ok());
641    }
642
643    // --- Parse paired files ---
644
645    #[test]
646    fn parse_paired_files_matching() {
647        let r1_file = write_temp_fastq(&[
648            ("read1/1", "ACGTACGT", "IIIIIIII"),
649            ("read2/1", "TGCATGCA", "IIIIIIII"),
650        ]);
651        let r2_file = write_temp_fastq(&[
652            ("read1/2", "GGGGCCCC", "IIIIIIII"),
653            ("read2/2", "AAAATTTT", "IIIIIIII"),
654        ]);
655        let pairs = parse_paired_fastq_files(
656            r1_file.path(),
657            r2_file.path(),
658            MateValidation::Relaxed,
659        )
660        .unwrap();
661        assert_eq!(pairs.len(), 2);
662        assert_eq!(pairs[0].r1().sequence().as_bytes(), b"ACGTACGT");
663        assert_eq!(pairs[0].r2().sequence().as_bytes(), b"GGGGCCCC");
664        assert_eq!(pairs[1].pair_name(), "read2");
665    }
666
667    #[test]
668    fn parse_paired_files_unequal_r1_longer() {
669        let r1_file = write_temp_fastq(&[
670            ("read1/1", "ACGTACGT", "IIIIIIII"),
671            ("read2/1", "TGCATGCA", "IIIIIIII"),
672        ]);
673        let r2_file = write_temp_fastq(&[("read1/2", "GGGGCCCC", "IIIIIIII")]);
674        let result = parse_paired_fastq_files(
675            r1_file.path(),
676            r2_file.path(),
677            MateValidation::Relaxed,
678        );
679        assert!(result.is_err());
680    }
681
682    #[test]
683    fn parse_paired_files_unequal_r2_longer() {
684        let r1_file = write_temp_fastq(&[("read1/1", "ACGTACGT", "IIIIIIII")]);
685        let r2_file = write_temp_fastq(&[
686            ("read1/2", "GGGGCCCC", "IIIIIIII"),
687            ("read2/2", "AAAATTTT", "IIIIIIII"),
688        ]);
689        let result = parse_paired_fastq_files(
690            r1_file.path(),
691            r2_file.path(),
692            MateValidation::Relaxed,
693        );
694        assert!(result.is_err());
695    }
696
697    #[test]
698    fn parse_paired_files_no_validation() {
699        let r1_file = write_temp_fastq(&[("foo", "ACGTACGT", "IIIIIIII")]);
700        let r2_file = write_temp_fastq(&[("bar", "GGGGCCCC", "IIIIIIII")]);
701        let pairs = parse_paired_fastq_files(
702            r1_file.path(),
703            r2_file.path(),
704            MateValidation::None,
705        )
706        .unwrap();
707        assert_eq!(pairs.len(), 1);
708    }
709
710    #[test]
711    fn parse_paired_files_validation_failure() {
712        let r1_file = write_temp_fastq(&[("read1/1", "ACGTACGT", "IIIIIIII")]);
713        let r2_file = write_temp_fastq(&[("read2/2", "GGGGCCCC", "IIIIIIII")]);
714        let result = parse_paired_fastq_files(
715            r1_file.path(),
716            r2_file.path(),
717            MateValidation::Relaxed,
718        );
719        assert!(result.is_err());
720    }
721
722    #[test]
723    fn parse_paired_files_strict_validation() {
724        let r1_file = write_temp_fastq(&[("read1/1", "ACGT", "IIII")]);
725        let r2_file = write_temp_fastq(&[("read1/2", "TGCA", "IIII")]);
726        let pairs = parse_paired_fastq_files(
727            r1_file.path(),
728            r2_file.path(),
729            MateValidation::Strict,
730        )
731        .unwrap();
732        assert_eq!(pairs.len(), 1);
733    }
734
735    // --- Parse interleaved ---
736
737    #[test]
738    fn parse_interleaved_valid() {
739        let file = write_temp_fastq(&[
740            ("read1/1", "ACGTACGT", "IIIIIIII"),
741            ("read1/2", "TGCATGCA", "IIIIIIII"),
742            ("read2/1", "GGGGCCCC", "IIIIIIII"),
743            ("read2/2", "AAAATTTT", "IIIIIIII"),
744        ]);
745        let pairs = parse_interleaved_fastq(file.path(), MateValidation::Relaxed).unwrap();
746        assert_eq!(pairs.len(), 2);
747        assert_eq!(pairs[0].r1().sequence().as_bytes(), b"ACGTACGT");
748        assert_eq!(pairs[0].r2().sequence().as_bytes(), b"TGCATGCA");
749    }
750
751    #[test]
752    fn parse_interleaved_odd_count() {
753        let file = write_temp_fastq(&[
754            ("read1/1", "ACGTACGT", "IIIIIIII"),
755            ("read1/2", "TGCATGCA", "IIIIIIII"),
756            ("read2/1", "GGGGCCCC", "IIIIIIII"),
757        ]);
758        let result = parse_interleaved_fastq(file.path(), MateValidation::None);
759        assert!(result.is_err());
760    }
761
762    // --- Write + re-parse round-trips ---
763
764    #[test]
765    fn write_parse_roundtrip_separate() {
766        let r1 = make_record("read1/1", b"ACGTACGT", &[30; 8]);
767        let r2 = make_record("read1/2", b"TGCATGCA", &[25; 8]);
768        let pairs = vec![PairedFastqRecord::new_unchecked(r1, r2)];
769
770        let r1_file = NamedTempFile::new().unwrap();
771        let r2_file = NamedTempFile::new().unwrap();
772        write_paired_fastq(&pairs, r1_file.path(), r2_file.path(), PhredEncoding::Phred33)
773            .unwrap();
774
775        let parsed = parse_paired_fastq_files(
776            r1_file.path(),
777            r2_file.path(),
778            MateValidation::None,
779        )
780        .unwrap();
781        assert_eq!(parsed.len(), 1);
782        assert_eq!(parsed[0].r1().sequence().as_bytes(), b"ACGTACGT");
783        assert_eq!(parsed[0].r2().sequence().as_bytes(), b"TGCATGCA");
784        assert_eq!(parsed[0].r1().quality().as_slice(), &[30; 8]);
785        assert_eq!(parsed[0].r2().quality().as_slice(), &[25; 8]);
786    }
787
788    #[test]
789    fn write_parse_roundtrip_interleaved() {
790        let r1 = make_record("read1/1", b"ACGTACGT", &[30; 8]);
791        let r2 = make_record("read1/2", b"TGCATGCA", &[25; 8]);
792        let pairs = vec![PairedFastqRecord::new_unchecked(r1, r2)];
793
794        let file = NamedTempFile::new().unwrap();
795        write_interleaved_fastq(&pairs, file.path(), PhredEncoding::Phred33).unwrap();
796
797        let parsed = parse_interleaved_fastq(file.path(), MateValidation::None).unwrap();
798        assert_eq!(parsed.len(), 1);
799        assert_eq!(parsed[0].r1().sequence().as_bytes(), b"ACGTACGT");
800        assert_eq!(parsed[0].r2().sequence().as_bytes(), b"TGCATGCA");
801    }
802
803    #[test]
804    fn write_parse_roundtrip_with_description() {
805        let r1 = make_record_with_desc("read1", "1:N:0:ATCACG", b"ACGTACGT", &[30; 8]);
806        let r2 = make_record_with_desc("read1", "2:N:0:ATCACG", b"TGCATGCA", &[25; 8]);
807        let pairs = vec![PairedFastqRecord::new_unchecked(r1, r2)];
808
809        let file = NamedTempFile::new().unwrap();
810        write_interleaved_fastq(&pairs, file.path(), PhredEncoding::Phred33).unwrap();
811
812        let parsed = parse_interleaved_fastq(file.path(), MateValidation::None).unwrap();
813        assert_eq!(parsed.len(), 1);
814        assert_eq!(parsed[0].r1().name(), "read1");
815        assert_eq!(parsed[0].r1().description(), Some("1:N:0:ATCACG"));
816        assert_eq!(parsed[0].r2().description(), Some("2:N:0:ATCACG"));
817    }
818
819    #[test]
820    fn write_parse_roundtrip_multiple_pairs() {
821        let pairs = vec![
822            PairedFastqRecord::new_unchecked(
823                make_record("r1/1", b"AAAA", &[30; 4]),
824                make_record("r1/2", b"CCCC", &[25; 4]),
825            ),
826            PairedFastqRecord::new_unchecked(
827                make_record("r2/1", b"GGGG", &[35; 4]),
828                make_record("r2/2", b"TTTT", &[20; 4]),
829            ),
830        ];
831
832        let r1_file = NamedTempFile::new().unwrap();
833        let r2_file = NamedTempFile::new().unwrap();
834        write_paired_fastq(&pairs, r1_file.path(), r2_file.path(), PhredEncoding::Phred33)
835            .unwrap();
836
837        let parsed = parse_paired_fastq_files(
838            r1_file.path(),
839            r2_file.path(),
840            MateValidation::None,
841        )
842        .unwrap();
843        assert_eq!(parsed.len(), 2);
844        assert_eq!(parsed[0].r1().sequence().as_bytes(), b"AAAA");
845        assert_eq!(parsed[1].r2().sequence().as_bytes(), b"TTTT");
846    }
847
848    // --- Interleave / deinterleave ---
849
850    #[test]
851    fn interleave_deinterleave_roundtrip() {
852        let r1_file = write_temp_fastq(&[
853            ("read1/1", "ACGTACGT", "IIIIIIII"),
854            ("read2/1", "TGCATGCA", "IIIIIIII"),
855        ]);
856        let r2_file = write_temp_fastq(&[
857            ("read1/2", "GGGGCCCC", "IIIIIIII"),
858            ("read2/2", "AAAATTTT", "IIIIIIII"),
859        ]);
860
861        let interleaved = NamedTempFile::new().unwrap();
862        let count = interleave_fastq_files(
863            r1_file.path(),
864            r2_file.path(),
865            interleaved.path(),
866            MateValidation::Relaxed,
867        )
868        .unwrap();
869        assert_eq!(count, 2);
870
871        let out_r1 = NamedTempFile::new().unwrap();
872        let out_r2 = NamedTempFile::new().unwrap();
873        let count = deinterleave_fastq_file(
874            interleaved.path(),
875            out_r1.path(),
876            out_r2.path(),
877            MateValidation::Relaxed,
878        )
879        .unwrap();
880        assert_eq!(count, 2);
881
882        let pairs = parse_paired_fastq_files(
883            out_r1.path(),
884            out_r2.path(),
885            MateValidation::None,
886        )
887        .unwrap();
888        assert_eq!(pairs.len(), 2);
889        assert_eq!(pairs[0].r1().sequence().as_bytes(), b"ACGTACGT");
890        assert_eq!(pairs[0].r2().sequence().as_bytes(), b"GGGGCCCC");
891    }
892
893    #[test]
894    fn interleave_unequal_files() {
895        let r1_file = write_temp_fastq(&[
896            ("read1/1", "ACGT", "IIII"),
897            ("read2/1", "TGCA", "IIII"),
898        ]);
899        let r2_file = write_temp_fastq(&[("read1/2", "GGGG", "IIII")]);
900        let output = NamedTempFile::new().unwrap();
901        let result = interleave_fastq_files(
902            r1_file.path(),
903            r2_file.path(),
904            output.path(),
905            MateValidation::None,
906        );
907        assert!(result.is_err());
908    }
909
910    // --- Empty / single pair edge cases ---
911
912    #[test]
913    fn parse_paired_empty_files() {
914        let r1_file = NamedTempFile::new().unwrap();
915        let r2_file = NamedTempFile::new().unwrap();
916        let pairs = parse_paired_fastq_files(
917            r1_file.path(),
918            r2_file.path(),
919            MateValidation::None,
920        )
921        .unwrap();
922        assert!(pairs.is_empty());
923    }
924
925    #[test]
926    fn parse_interleaved_empty_file() {
927        let file = NamedTempFile::new().unwrap();
928        let pairs = parse_interleaved_fastq(file.path(), MateValidation::None).unwrap();
929        assert!(pairs.is_empty());
930    }
931
932    #[test]
933    fn parse_paired_single_pair() {
934        let r1_file = write_temp_fastq(&[("read1/1", "ACGT", "IIII")]);
935        let r2_file = write_temp_fastq(&[("read1/2", "TGCA", "IIII")]);
936        let pairs = parse_paired_fastq_files(
937            r1_file.path(),
938            r2_file.path(),
939            MateValidation::Strict,
940        )
941        .unwrap();
942        assert_eq!(pairs.len(), 1);
943    }
944
945    #[test]
946    fn write_empty_pairs() {
947        let pairs: Vec<PairedFastqRecord> = vec![];
948        let r1_file = NamedTempFile::new().unwrap();
949        let r2_file = NamedTempFile::new().unwrap();
950        write_paired_fastq(&pairs, r1_file.path(), r2_file.path(), PhredEncoding::Phred33)
951            .unwrap();
952        let parsed = parse_paired_fastq_files(
953            r1_file.path(),
954            r2_file.path(),
955            MateValidation::None,
956        )
957        .unwrap();
958        assert!(parsed.is_empty());
959    }
960
961    // --- Stats ---
962
963    #[test]
964    fn paired_stats() {
965        let r1_file = write_temp_fastq(&[
966            ("read1", "ACGTACGT", "IIIIIIII"),
967            ("read2", "TGCA", "IIII"),
968        ]);
969        let r2_file = write_temp_fastq(&[
970            ("read1", "GGGGCCCC", "IIIIIIII"),
971            ("read2", "AAAA", "IIII"),
972        ]);
973        let stats = parse_paired_fastq_stats(r1_file.path(), r2_file.path()).unwrap();
974        assert_eq!(stats.pair_count, 2);
975        assert_eq!(stats.r1_stats.sequence_count, 2);
976        assert_eq!(stats.r2_stats.sequence_count, 2);
977    }
978}
979
980#[cfg(test)]
981mod proptests {
982    use super::*;
983    use crate::types::DnaSequence;
984    use proptest::prelude::*;
985    use tempfile::NamedTempFile;
986
987    fn dna_and_quality(max_len: usize) -> impl Strategy<Value = (Vec<u8>, Vec<u8>)> {
988        (1..=max_len).prop_flat_map(|len| {
989            let seq = proptest::collection::vec(
990                prop_oneof![Just(b'A'), Just(b'C'), Just(b'G'), Just(b'T')],
991                len,
992            );
993            let qual = proptest::collection::vec(0..=41u8, len);
994            (seq, qual)
995        })
996    }
997
998    proptest! {
999        #[test]
1000        fn roundtrip_write_parse(
1001            (seq1, qual1) in dna_and_quality(100),
1002            (seq2, qual2) in dna_and_quality(100),
1003        ) {
1004            let r1 = {
1005                let s = DnaSequence::new(&seq1).unwrap();
1006                let q = QualityScores::from_raw(qual1.clone());
1007                FastqRecord::new("read/1".into(), None, s, q).unwrap()
1008            };
1009            let r2 = {
1010                let s = DnaSequence::new(&seq2).unwrap();
1011                let q = QualityScores::from_raw(qual2.clone());
1012                FastqRecord::new("read/2".into(), None, s, q).unwrap()
1013            };
1014            let pairs = vec![PairedFastqRecord::new_unchecked(r1, r2)];
1015
1016            let file = NamedTempFile::new().unwrap();
1017            write_interleaved_fastq(&pairs, file.path(), PhredEncoding::Phred33).unwrap();
1018
1019            let parsed = parse_interleaved_fastq(file.path(), MateValidation::None).unwrap();
1020            prop_assert_eq!(parsed.len(), 1);
1021            prop_assert_eq!(parsed[0].r1().sequence().as_bytes(), seq1.as_slice());
1022            prop_assert_eq!(parsed[0].r2().sequence().as_bytes(), seq2.as_slice());
1023            prop_assert_eq!(parsed[0].r1().quality().as_slice(), qual1.as_slice());
1024            prop_assert_eq!(parsed[0].r2().quality().as_slice(), qual2.as_slice());
1025        }
1026
1027        #[test]
1028        fn interleave_deinterleave_identity(
1029            (seq1, qual1) in dna_and_quality(50),
1030            (seq2, qual2) in dna_and_quality(50),
1031        ) {
1032            use std::io::Write as _;
1033
1034            let r1_file = NamedTempFile::new().unwrap();
1035            let r2_file = NamedTempFile::new().unwrap();
1036
1037            // Write R1 FASTQ
1038            {
1039                let ascii1: Vec<u8> = qual1.iter().map(|&q| q + 33).collect();
1040                let seq_str = std::str::from_utf8(&seq1).unwrap();
1041                let qual_str = std::str::from_utf8(&ascii1).unwrap();
1042                write!(r1_file.as_file(), "@read/1\n{}\n+\n{}\n", seq_str, qual_str).unwrap();
1043            }
1044            // Write R2 FASTQ
1045            {
1046                let ascii2: Vec<u8> = qual2.iter().map(|&q| q + 33).collect();
1047                let seq_str = std::str::from_utf8(&seq2).unwrap();
1048                let qual_str = std::str::from_utf8(&ascii2).unwrap();
1049                write!(r2_file.as_file(), "@read/2\n{}\n+\n{}\n", seq_str, qual_str).unwrap();
1050            }
1051
1052            let interleaved = NamedTempFile::new().unwrap();
1053            let count = interleave_fastq_files(
1054                r1_file.path(),
1055                r2_file.path(),
1056                interleaved.path(),
1057                MateValidation::None,
1058            )
1059            .unwrap();
1060            prop_assert_eq!(count, 1);
1061
1062            let out_r1 = NamedTempFile::new().unwrap();
1063            let out_r2 = NamedTempFile::new().unwrap();
1064            let count2 = deinterleave_fastq_file(
1065                interleaved.path(),
1066                out_r1.path(),
1067                out_r2.path(),
1068                MateValidation::None,
1069            )
1070            .unwrap();
1071            prop_assert_eq!(count2, 1);
1072
1073            let pairs = parse_paired_fastq_files(
1074                out_r1.path(),
1075                out_r2.path(),
1076                MateValidation::None,
1077            )
1078            .unwrap();
1079            prop_assert_eq!(pairs.len(), 1);
1080            prop_assert_eq!(pairs[0].r1().sequence().as_bytes(), seq1.as_slice());
1081            prop_assert_eq!(pairs[0].r2().sequence().as_bytes(), seq2.as_slice());
1082        }
1083    }
1084}