1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
//! Read merger implementation.
//!
//! This module provides the core algorithm for merging overlapping paired-end reads.
use serde::{Deserialize, Serialize};
use super::MergeConfig;
use crate::io::OwnedRecord;
/// Outcome of a merge attempt on one read pair.
#[derive(Debug, Clone)]
pub enum MergeResult {
    /// The pair overlapped and was collapsed into a single record.
    Merged(OwnedRecord),
    /// The pair was not merged; holds the two records of the pair.
    Unmerged(OwnedRecord, OwnedRecord),
}

impl MergeResult {
    /// `true` when this value is the [`MergeResult::Merged`] variant.
    #[inline]
    pub fn is_merged(&self) -> bool {
        match self {
            MergeResult::Merged(_) => true,
            MergeResult::Unmerged(..) => false,
        }
    }

    /// `true` when this value is the [`MergeResult::Unmerged`] variant.
    #[inline]
    pub fn is_unmerged(&self) -> bool {
        // The enum has exactly two variants, so this is the complement of `is_merged`.
        !self.is_merged()
    }
}
/// One candidate alignment of R1 against the reverse complement of R2 (R2-RC).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OverlapInfo {
    /// Signed position of the start of R2-RC relative to the start of R1.
    ///
    /// Zero or positive: R2-RC begins at this index of R1.
    /// Negative: R1 begins at index `-offset` of R2-RC, i.e. R2-RC reaches
    /// back past the start of R1.
    pub offset: i32,
    /// Number of positions the two reads share at this offset.
    pub overlap_len: usize,
    /// How many of the shared positions disagree.
    pub mismatches: usize,
    /// Alignment score for this offset; larger means a better candidate.
    pub score: i64,
}

impl OverlapInfo {
    /// Fraction of the overlapping positions that are mismatches.
    ///
    /// An empty overlap reports `1.0` rather than dividing by zero.
    #[inline]
    pub fn mismatch_ratio(&self) -> f64 {
        match self.overlap_len {
            0 => 1.0,
            len => self.mismatches as f64 / len as f64,
        }
    }
}
/// Running counters describing the outcome of merge attempts.
///
/// Every field is a plain sum, so two instances can be combined by adding
/// them field by field (see `MergeStats::merge`).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct MergeStats {
/// Read pairs submitted for merging; `ReadMerger::merge_with_stats` adds one per call.
pub pairs_total: u64,
/// Pairs that produced a merged record.
pub pairs_merged: u64,
/// Pairs returned unmerged (merging disabled, reads too short, or no acceptable overlap).
pub pairs_unmerged: u64,
/// Sum of the per-pair correction counts reported when a merge is performed.
pub mismatches_corrected: u64,
/// Sum of the overlap lengths of all merged pairs; divide by `pairs_merged` for the mean.
pub total_overlap_length: u64,
}
impl MergeStats {
/// Create new empty stats.
pub fn new() -> Self {
Self::default()
}
/// Merge statistics from another instance.
pub fn merge(&mut self, other: &MergeStats) {
self.pairs_total += other.pairs_total;
self.pairs_merged += other.pairs_merged;
self.pairs_unmerged += other.pairs_unmerged;
self.mismatches_corrected += other.mismatches_corrected;
self.total_overlap_length += other.total_overlap_length;
}
/// Get merge rate as a percentage.
pub fn merge_rate(&self) -> f64 {
if self.pairs_total == 0 {
0.0
} else {
(self.pairs_merged as f64 / self.pairs_total as f64) * 100.0
}
}
/// Get average overlap length for merged reads.
pub fn avg_overlap_length(&self) -> f64 {
if self.pairs_merged == 0 {
0.0
} else {
self.total_overlap_length as f64 / self.pairs_merged as f64
}
}
}
/// Merges overlapping paired-end reads according to a `MergeConfig`.
///
/// The merger holds nothing but its configuration; every merge call takes
/// `&self` and reads settings from it.
#[derive(Debug, Clone)]
pub struct ReadMerger {
/// Settings consulted on every merge attempt (enable flag, minimum overlap,
/// mismatch limits, mismatch-correction options).
config: MergeConfig,
}
impl ReadMerger {
/// Create a new ReadMerger with the given configuration.
pub fn new(config: MergeConfig) -> Self {
Self { config }
}
/// Build a merger from `MergeConfig::default()`.
pub fn default_config() -> Self {
    let config = MergeConfig::default();
    Self::new(config)
}
/// Get the configuration.
pub fn config(&self) -> &MergeConfig {
&self.config
}
/// Attempt to merge a read pair.
///
/// Returns `MergeResult::Merged` with the combined record on success.
/// Returns `MergeResult::Unmerged` holding clones of `r1` and `r2` when
/// merging is disabled, when either read is shorter than the configured
/// minimum overlap, or when no overlap satisfies the configured limits.
pub fn merge(&self, r1: &OwnedRecord, r2: &OwnedRecord) -> MergeResult {
    let cfg = &self.config;
    let give_up = || MergeResult::Unmerged(r1.clone(), r2.clone());

    // Cheap rejections first, before anything is allocated.
    let too_short = |record: &OwnedRecord| record.seq.len() < cfg.min_overlap;
    if !cfg.enabled || too_short(r1) || too_short(r2) {
        return give_up();
    }

    // Bring R2 into R1's orientation; qualities are reversed to stay aligned with the bases.
    let rc_seq = reverse_complement(&r2.seq);
    let rc_qual: Vec<u8> = r2.qual.iter().rev().copied().collect();

    match self.find_best_overlap(&r1.seq, &r1.qual, &rc_seq, &rc_qual) {
        Some(found) => {
            let rejected = found.overlap_len < cfg.min_overlap
                || found.mismatch_ratio() > cfg.max_mismatch_ratio;
            if rejected {
                give_up()
            } else {
                MergeResult::Merged(self.perform_merge(r1, &rc_seq, &rc_qual, &found))
            }
        }
        None => give_up(),
    }
}
/// Attempt to merge a read pair and record the outcome in `stats`.
///
/// `stats.pairs_total` is incremented on every call. On success
/// `pairs_merged`, `total_overlap_length` and `mismatches_corrected` are
/// updated; otherwise `pairs_unmerged` is incremented and clones of the
/// input records are returned as `MergeResult::Unmerged`.
pub fn merge_with_stats(
    &self,
    r1: &OwnedRecord,
    r2: &OwnedRecord,
    stats: &mut MergeStats,
) -> MergeResult {
    stats.pairs_total += 1;

    let cfg = &self.config;
    // Every way of failing yields `None`, so the bookkeeping for an unmerged
    // pair is written exactly once below.
    let attempt = || -> Option<(OwnedRecord, usize, usize)> {
        if !cfg.enabled {
            return None;
        }
        // Cheap length rejection before anything is allocated.
        if r1.seq.len() < cfg.min_overlap || r2.seq.len() < cfg.min_overlap {
            return None;
        }

        // Bring R2 into R1's orientation; qualities are reversed to stay aligned.
        let rc_seq = reverse_complement(&r2.seq);
        let rc_qual: Vec<u8> = r2.qual.iter().rev().copied().collect();

        let found = self.find_best_overlap(&r1.seq, &r1.qual, &rc_seq, &rc_qual)?;
        if found.overlap_len < cfg.min_overlap {
            return None;
        }
        if found.mismatch_ratio() > cfg.max_mismatch_ratio {
            return None;
        }

        let (record, fixed) = self.perform_merge_counted(r1, &rc_seq, &rc_qual, &found);
        Some((record, found.overlap_len, fixed))
    };

    match attempt() {
        Some((record, overlap_len, fixed)) => {
            stats.pairs_merged += 1;
            stats.total_overlap_length += overlap_len as u64;
            stats.mismatches_corrected += fixed as u64;
            MergeResult::Merged(record)
        }
        None => {
            stats.pairs_unmerged += 1;
            MergeResult::Unmerged(r1.clone(), r2.clone())
        }
    }
}
/// Find the best overlap between R1 and RC(R2).
///
/// Every offset that leaves at least `min_overlap` shared positions is
/// scored with `calculate_overlap_score` (quality-weighted matches minus a
/// mismatch penalty). The winner is the candidate with the highest score;
/// ties go to the longer overlap, then to fewer mismatches.
///
/// Returns `None` when:
/// - either sequence is empty,
/// - a quality slice does not have the same length as its sequence,
/// - no offset yields a long enough overlap, or
/// - the winning candidate exceeds `max_mismatch_ratio`.
fn find_best_overlap(
    &self,
    r1_seq: &[u8],
    r1_qual: &[u8],
    r2_rc_seq: &[u8],
    r2_rc_qual: &[u8],
) -> Option<OverlapInfo> {
    let r1_len = r1_seq.len();
    let r2_len = r2_rc_seq.len();
    if r1_len == 0 || r2_len == 0 {
        return None;
    }
    // The quality slices are indexed with ranges derived from the sequence
    // lengths, so a record whose qualities do not line up with its bases
    // would panic below. Treat such a pair as unmergeable instead.
    if r1_qual.len() != r1_len || r2_rc_qual.len() != r2_len {
        return None;
    }

    let min_overlap = self.config.min_overlap;
    let mut best_overlap: Option<OverlapInfo> = None;

    // The offset is where R2-RC starts relative to R1.
    //
    // Zero or positive offset: R2-RC starts inside R1.
    //   R1:    |--------------------|
    //   R2-RC:        |------------------|
    //                 ^ offset
    //
    // Negative offset: R2-RC starts before R1.
    //   R1:         |--------------------|
    //   R2-RC: |------------------|
    //          ^ negative offset
    //
    // Offsets are visited in ascending order, from the most negative one
    // that still leaves `min_overlap` bases of R2-RC on R1, up to the
    // largest one that still leaves `min_overlap` bases of R1 under R2-RC.
    // When a read is shorter than `min_overlap` the range is empty.
    let max_offset = (r1_len as i32) - (min_overlap as i32);
    let min_offset = -((r2_len as i32) - (min_overlap as i32));

    for offset in min_offset..=max_offset {
        let (r1_start, r2_start, overlap_len) =
            calculate_overlap_region(r1_len, r2_len, offset);
        if overlap_len < min_overlap {
            continue;
        }

        let (score, mismatches) = self.calculate_overlap_score(
            &r1_seq[r1_start..r1_start + overlap_len],
            &r1_qual[r1_start..r1_start + overlap_len],
            &r2_rc_seq[r2_start..r2_start + overlap_len],
            &r2_rc_qual[r2_start..r2_start + overlap_len],
        );
        let info = OverlapInfo {
            offset,
            overlap_len,
            mismatches,
            score,
        };

        // Prefer higher score, then longer overlap, then fewer mismatches.
        // An exact tie keeps the earlier (smaller) offset.
        let is_better = match &best_overlap {
            None => true,
            Some(best) => {
                info.score > best.score
                    || (info.score == best.score && info.overlap_len > best.overlap_len)
                    || (info.score == best.score
                        && info.overlap_len == best.overlap_len
                        && info.mismatches < best.mismatches)
            }
        };
        if is_better {
            best_overlap = Some(info);
        }
    }

    // Only the single best candidate is checked against the mismatch limit;
    // lower-scoring candidates are not reconsidered if it fails.
    best_overlap.filter(|o| o.mismatch_ratio() <= self.config.max_mismatch_ratio)
}
/// Score one candidate overlap, weighting each position by base quality.
///
/// The four slices describe the same overlap region: bases and Phred+33
/// qualities of R1, then of R2-RC. A matching position adds the sum of the
/// two decoded qualities; a mismatching position subtracts half of that sum.
///
/// Returns `(score, mismatch_count)`.
fn calculate_overlap_score(
    &self,
    r1_seq: &[u8],
    r1_qual: &[u8],
    r2_seq: &[u8],
    r2_qual: &[u8],
) -> (i64, usize) {
    // Decode Phred+33; bytes below 33 decode to 0 instead of wrapping.
    let phred = |encoded: u8| i64::from(encoded.saturating_sub(33));

    let mut total: i64 = 0;
    let mut mismatch_count: usize = 0;
    for (pos, &base1) in r1_seq.iter().enumerate() {
        let base2 = r2_seq[pos];
        let pair_quality = phred(r1_qual[pos]) + phred(r2_qual[pos]);
        if bases_match(base1, base2) {
            total += pair_quality;
        } else {
            // Confident disagreements cost more than low-quality ones.
            mismatch_count += 1;
            total -= pair_quality / 2;
        }
    }
    (total, mismatch_count)
}
/// Build the merged record for `overlap`, discarding the correction count.
///
/// Thin wrapper over `perform_merge_counted` for callers that do not keep statistics.
fn perform_merge(
    &self,
    r1: &OwnedRecord,
    r2_rc_seq: &[u8],
    r2_rc_qual: &[u8],
    overlap: &OverlapInfo,
) -> OwnedRecord {
    self.perform_merge_counted(r1, r2_rc_seq, r2_rc_qual, overlap).0
}
/// Build the merged record for `overlap` and count corrected mismatches.
///
/// The merged read is assembled from three pieces:
/// 1. the part of R1 before the overlap (if any),
/// 2. the overlap region, resolved position by position,
/// 3. the part of R2-RC after the overlap (if any).
///
/// Overlap positions are resolved as follows:
/// - Both reads agree on a called base: that base, with `consensus_quality`.
/// - Exactly one read has `N`: the other read's base with its own quality.
///   The `N` carries no evidence, so it neither replaces a real call nor
///   boosts the quality.
/// - Both reads have `N`: R1's base with `consensus_quality`.
/// - The reads disagree and `correct_mismatches` is on: the base whose
///   quality leads by at least `quality_diff_threshold` wins and is counted
///   as a correction; otherwise the higher-quality base is kept with a
///   reduced quality.
/// - The reads disagree and `correct_mismatches` is off: R1's base with a
///   reduced quality.
///
/// The record name is R1's name with `:merged` appended.
///
/// Returns `(merged_record, corrections)`.
fn perform_merge_counted(
    &self,
    r1: &OwnedRecord,
    r2_rc_seq: &[u8],
    r2_rc_qual: &[u8],
    overlap: &OverlapInfo,
) -> (OwnedRecord, usize) {
    let r1_len = r1.seq.len();
    let r2_len = r2_rc_seq.len();
    let (r1_start, r2_start, overlap_len) =
        calculate_overlap_region(r1_len, r2_len, overlap.offset);

    let r1_prefix_len = r1_start;
    let r2_suffix_start = r2_start + overlap_len;
    let r2_suffix_len = r2_len - r2_suffix_start;
    let merged_len = r1_prefix_len + overlap_len + r2_suffix_len;

    let mut merged_seq = Vec::with_capacity(merged_len);
    let mut merged_qual = Vec::with_capacity(merged_len);
    let mut corrections = 0;

    // R1 prefix (before the overlap).
    merged_seq.extend_from_slice(&r1.seq[..r1_prefix_len]);
    merged_qual.extend_from_slice(&r1.qual[..r1_prefix_len]);

    let threshold = self.config.quality_diff_threshold;
    let is_n = |base: u8| base.eq_ignore_ascii_case(&b'N');

    // Overlap region: decide base and quality per position.
    for i in 0..overlap_len {
        let r1_idx = r1_start + i;
        let r2_idx = r2_start + i;
        let (b1, q1) = (r1.seq[r1_idx], r1.qual[r1_idx]);
        let (b2, q2) = (r2_rc_seq[r2_idx], r2_rc_qual[r2_idx]);

        let (base, qual) = if bases_match(b1, b2) {
            match (is_n(b1), is_n(b2)) {
                // Only one read actually called this position: trust that call as-is.
                (true, false) => (b2, q2),
                (false, true) => (b1, q1),
                // Genuine agreement (or N on both sides): boosted consensus quality.
                _ => (b1, consensus_quality(q1, q2)),
            }
        } else if self.config.correct_mismatches {
            let q1_score = q1.saturating_sub(33);
            let q2_score = q2.saturating_sub(33);
            // Saturating: a large threshold must not overflow the u8 sum.
            if q1_score >= q2_score.saturating_add(threshold) {
                corrections += 1;
                (b1, q1)
            } else if q2_score >= q1_score.saturating_add(threshold) {
                corrections += 1;
                (b2, q2)
            } else {
                // Ambiguous: keep the higher-quality base but mark it as low quality.
                let winner = if q1_score >= q2_score { b1 } else { b2 };
                (winner, q1.min(q2).saturating_sub(10).max(33))
            }
        } else {
            // No correction: keep R1's base with reduced quality.
            (b1, q1.saturating_sub(10).max(33))
        };

        merged_seq.push(base);
        merged_qual.push(qual);
    }

    // R2-RC suffix (after the overlap).
    if r2_suffix_len > 0 {
        merged_seq.extend_from_slice(&r2_rc_seq[r2_suffix_start..]);
        merged_qual.extend_from_slice(&r2_rc_qual[r2_suffix_start..]);
    }

    let mut merged_name = r1.name.clone();
    merged_name.extend_from_slice(b":merged");

    (
        OwnedRecord::new(merged_name, merged_seq, merged_qual),
        corrections,
    )
}
}
/// Calculate the overlap region given sequence lengths and offset.
///
/// `offset` is where R2-RC starts relative to R1: zero or positive means
/// R2-RC starts at index `offset` of R1; negative means R1 starts at index
/// `-offset` of R2-RC.
///
/// Returns `(r1_start, r2_start, overlap_length)`. The overlap runs from the
/// returned start index in each read for `overlap_length` positions.
///
/// An offset that places the reads completely apart (its magnitude exceeds
/// the relevant read length) yields an overlap of length 0 with the start
/// clamped to the end of that read, so the result is always safe to slice with.
#[inline]
fn calculate_overlap_region(r1_len: usize, r2_len: usize, offset: i32) -> (usize, usize, usize) {
    // `unsigned_abs` is defined for every `i32`, unlike `-offset`, which
    // overflows for `i32::MIN`.
    let shift = offset.unsigned_abs() as usize;

    // Clamp the start so the subtractions below cannot underflow.
    let (r1_start, r2_start) = if offset >= 0 {
        (shift.min(r1_len), 0)
    } else {
        (0, shift.min(r2_len))
    };

    // Both reads run to their own end; the shorter remainder bounds the overlap.
    let overlap_len = (r1_len - r1_start).min(r2_len - r2_start);
    (r1_start, r2_start, overlap_len)
}
/// Return the reverse complement of `seq` as a new vector.
///
/// Bases are visited last-to-first and each one is mapped through
/// `complement_base`, so the output has the same length as the input.
#[inline]
pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(seq.len());
    for &base in seq.iter().rev() {
        out.push(complement_base(base));
    }
    out
}
/// Complement a single nucleotide byte.
///
/// `A`/`T` and `G`/`C` are swapped, case-insensitively. The result is always
/// upper case. Any other byte, including `N`, maps to `N`.
#[inline]
fn complement_base(base: u8) -> u8 {
    match base.to_ascii_uppercase() {
        b'A' => b'T',
        b'T' => b'A',
        b'G' => b'C',
        b'C' => b'G',
        // `N` itself and every unrecognised byte.
        _ => b'N',
    }
}
/// Decide whether two base bytes are compatible.
///
/// Comparison ignores ASCII case, and `N` (either case) on either side is a
/// wildcard that matches any byte.
#[inline]
fn bases_match(b1: u8, b2: u8) -> bool {
    let is_wildcard = |base: u8| matches!(base, b'N' | b'n');
    is_wildcard(b1) || is_wildcard(b2) || b1.eq_ignore_ascii_case(&b2)
}
/// Combine the Phred+33 qualities of two reads that agree on a base.
///
/// Agreement makes the call more trustworthy, so the result is the larger of
/// the two encoded qualities plus a small bonus. The bonus is one point per
/// ten Phred points of the *weaker* read, at most 5. The result is capped at
/// ASCII 126 (`93 + 33`, i.e. Q93).
#[inline]
fn consensus_quality(q1: u8, q2: u8) -> u8 {
    const PHRED_OFFSET: u8 = 33;
    const MAX_ENCODED: u8 = 93 + 33;

    let (weaker, stronger) = if q1 <= q2 { (q1, q2) } else { (q2, q1) };
    // Bytes below the offset decode to Q0 and earn no bonus.
    let bonus = (weaker.saturating_sub(PHRED_OFFSET) / 10).min(5);
    stronger.saturating_add(bonus).min(MAX_ENCODED)
}
// Unit tests for the merger: the free helper functions, the statistics
// bookkeeping, and end-to-end merging of small hand-built read pairs.
#[cfg(test)]
mod tests {
use super::*;
// Builds a record from raw Phred scores, shifting each into Phred+33 encoding.
fn make_record(name: &[u8], seq: &[u8], qual_scores: &[u8]) -> OwnedRecord {
let qual: Vec<u8> = qual_scores.iter().map(|&q| q + 33).collect();
OwnedRecord::new(name.to_vec(), seq.to_vec(), qual)
}
// Palindromic, homopolymer, mixed and empty inputs.
#[test]
fn test_reverse_complement() {
assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
assert_eq!(reverse_complement(b"TTTT"), b"AAAA");
assert_eq!(reverse_complement(b"GCGC"), b"GCGC");
assert_eq!(reverse_complement(b"ATCG"), b"CGAT");
assert_eq!(reverse_complement(b""), b"");
}
// Each canonical base, N, and a lower-case input (output is upper case).
#[test]
fn test_complement_base() {
assert_eq!(complement_base(b'A'), b'T');
assert_eq!(complement_base(b'T'), b'A');
assert_eq!(complement_base(b'G'), b'C');
assert_eq!(complement_base(b'C'), b'G');
assert_eq!(complement_base(b'N'), b'N');
assert_eq!(complement_base(b'a'), b'T');
}
// Case-insensitive equality, with N acting as a wildcard on either side.
#[test]
fn test_bases_match() {
assert!(bases_match(b'A', b'A'));
assert!(bases_match(b'A', b'a'));
assert!(!bases_match(b'A', b'T'));
assert!(bases_match(b'N', b'A'));
assert!(bases_match(b'A', b'N'));
assert!(bases_match(b'N', b'N'));
}
#[test]
fn test_calculate_overlap_region_positive_offset() {
// R1: 100bp, R2: 100bp, offset: 50
// R2-RC starts at position 50 in R1
let (r1_start, r2_start, overlap) = calculate_overlap_region(100, 100, 50);
assert_eq!(r1_start, 50);
assert_eq!(r2_start, 0);
assert_eq!(overlap, 50); // min(100-50, 100-0) = 50
}
#[test]
fn test_calculate_overlap_region_negative_offset() {
// R1: 100bp, R2: 100bp, offset: -20
// R1 starts at position 20 in R2-RC
let (r1_start, r2_start, overlap) = calculate_overlap_region(100, 100, -20);
assert_eq!(r1_start, 0);
assert_eq!(r2_start, 20);
assert_eq!(overlap, 80); // min(100, 100-20) = 80
}
#[test]
fn test_calculate_overlap_region_zero_offset() {
// Offset 0 with equal lengths: the reads overlap completely.
let (r1_start, r2_start, overlap) = calculate_overlap_region(100, 100, 0);
assert_eq!(r1_start, 0);
assert_eq!(r2_start, 0);
assert_eq!(overlap, 100);
}
// 10 mismatches over a 100bp overlap is a ratio of 0.1.
#[test]
fn test_overlap_info_mismatch_ratio() {
let info = OverlapInfo {
offset: 0,
overlap_len: 100,
mismatches: 10,
score: 0,
};
assert!((info.mismatch_ratio() - 0.1).abs() < f64::EPSILON);
}
// Derived rates: zero when empty, then 80% merged and a mean overlap of 50.
#[test]
fn test_merge_stats() {
let mut stats = MergeStats::new();
assert_eq!(stats.merge_rate(), 0.0);
stats.pairs_total = 100;
stats.pairs_merged = 80;
stats.pairs_unmerged = 20;
stats.total_overlap_length = 4000;
assert!((stats.merge_rate() - 80.0).abs() < f64::EPSILON);
assert!((stats.avg_overlap_length() - 50.0).abs() < f64::EPSILON);
}
// Combining two stats objects adds every counter field by field.
#[test]
fn test_merge_stats_merge() {
let mut stats1 = MergeStats {
pairs_total: 50,
pairs_merged: 40,
pairs_unmerged: 10,
mismatches_corrected: 5,
total_overlap_length: 2000,
};
let stats2 = MergeStats {
pairs_total: 50,
pairs_merged: 30,
pairs_unmerged: 20,
mismatches_corrected: 3,
total_overlap_length: 1500,
};
stats1.merge(&stats2);
assert_eq!(stats1.pairs_total, 100);
assert_eq!(stats1.pairs_merged, 70);
assert_eq!(stats1.pairs_unmerged, 30);
assert_eq!(stats1.mismatches_corrected, 8);
assert_eq!(stats1.total_overlap_length, 3500);
}
// A disabled configuration never merges, whatever the reads look like.
#[test]
fn test_merger_disabled() {
let config = MergeConfig::disabled();
let merger = ReadMerger::new(config);
let r1 = make_record(b"r1", b"ACGTACGT", &[30; 8]);
let r2 = make_record(b"r2", b"ACGTACGT", &[30; 8]);
let result = merger.merge(&r1, &r2);
assert!(result.is_unmerged());
}
#[test]
fn test_merger_perfect_overlap() {
// Construct a pair whose true overlap is the last 8bp of R1.
//
//   R1    = AAAACCCCGGGGTTTT      (16bp)
//   R2-RC =         GGGGTTTTAAAA  (12bp, starts at position 8 of R1)
//
// R2 as sequenced is the reverse complement of R2-RC:
//   reverse(GGGGTTTTAAAA) = AAAATTTTGGGG
//   complement            = TTTTAAAACCCC
let config = MergeConfig::enabled().with_min_overlap(8);
let merger = ReadMerger::new(config);
let r1 = make_record(b"r1", b"AAAACCCCGGGGTTTT", &[30; 16]);
let r2 = make_record(b"r2", b"TTTTAAAACCCC", &[30; 12]);
let result = merger.merge(&r1, &r2);
assert!(result.is_merged());
if let MergeResult::Merged(merged) = result {
// Merged should be: R1 prefix + overlap + R2-RC suffix
// R1 prefix (before overlap): AAAACCCC (8bp)
// Overlap: GGGGTTTT (8bp)
// R2-RC suffix (after overlap): AAAA (4bp)
// Total: AAAACCCCGGGGTTTTAAAA (20bp)
assert_eq!(merged.seq.len(), 20);
assert_eq!(&merged.seq[..8], b"AAAACCCC");
assert_eq!(&merged.seq[8..16], b"GGGGTTTT");
assert_eq!(&merged.seq[16..], b"AAAA");
}
}
#[test]
fn test_merger_no_valid_overlap() {
let config = MergeConfig::enabled().with_min_overlap(30);
let merger = ReadMerger::new(config);
// Both reads are 8bp, shorter than the 30bp minimum overlap.
let r1 = make_record(b"r1", b"ACGTACGT", &[30; 8]);
let r2 = make_record(b"r2", b"TGCATGCA", &[30; 8]);
let result = merger.merge(&r1, &r2);
assert!(result.is_unmerged());
}
#[test]
fn test_merger_high_mismatch() {
let config = MergeConfig::enabled()
.with_min_overlap(8)
.with_max_mismatch_ratio(0.05); // Very strict
let merger = ReadMerger::new(config);
// R1 is the same as in the perfect-overlap test; R2 is altered so that
// its reverse complement (CCCCTTTTAAAA) no longer lines up cleanly with R1.
let r1 = make_record(b"r1", b"AAAACCCCGGGGTTTT", &[30; 16]);
let r2 = make_record(b"r2", b"TTTTAAAAGGGG", &[30; 12]); // Different overlap
let result = merger.merge(&r1, &r2);
// With strict mismatch ratio, should not merge
assert!(result.is_unmerged());
}
#[test]
fn test_consensus_quality() {
// Two high quality bases
let q = consensus_quality(73, 73); // Q40 + Q40
assert!(q > 73); // Should be boosted
// One high, one low
let q2 = consensus_quality(73, 40); // Q40 + Q7
assert_eq!(q2, 73 + ((40 - 33) / 10).min(5)); // Max is 73, bonus from min
}
// `is_merged` and `is_unmerged` are mutually exclusive for both variants.
#[test]
fn test_merge_result_helpers() {
let r1 = make_record(b"r1", b"ACGT", &[30; 4]);
let r2 = make_record(b"r2", b"TGCA", &[30; 4]);
let merged_result = MergeResult::Merged(r1.clone());
assert!(merged_result.is_merged());
assert!(!merged_result.is_unmerged());
let unmerged_result = MergeResult::Unmerged(r1, r2);
assert!(!unmerged_result.is_merged());
assert!(unmerged_result.is_unmerged());
}
// Same pair as the perfect-overlap test; checks the counters a successful merge updates.
#[test]
fn test_merger_with_stats() {
let config = MergeConfig::enabled().with_min_overlap(8);
let merger = ReadMerger::new(config);
let mut stats = MergeStats::new();
let r1 = make_record(b"r1", b"AAAACCCCGGGGTTTT", &[30; 16]);
let r2 = make_record(b"r2", b"TTTTAAAACCCC", &[30; 12]);
let result = merger.merge_with_stats(&r1, &r2, &mut stats);
assert!(result.is_merged());
assert_eq!(stats.pairs_total, 1);
assert_eq!(stats.pairs_merged, 1);
assert!(stats.total_overlap_length > 0);
}
}