use std::borrow::Cow;
use std::cell::RefCell;
use crate::transformations::prelude::*;
use fastqrab_dna::dna::reverse_complement;
use fastqrab_io::io::WrappedFastQReadMut;
#[derive(Clone, PartialEq, Eq, JsonSchema)]
#[tpd]
#[derive(Debug)]
pub enum Algorithm {
#[tpd(alias = "FastpSeemsWeird")]
Fastp,
}
#[derive(Clone, PartialEq, Eq, JsonSchema)]
#[tpd]
#[derive(Debug)]
pub enum NoOverlapStrategy {
AsIs,
Concatenate,
}
#[derive(Clone, JsonSchema)]
#[tpd]
#[derive(Debug)]
pub struct MergeReads {
pub algorithm: Algorithm,
pub min_overlap: usize,
pub max_mismatch_rate: f64,
pub max_mismatch_count: usize,
pub no_overlap_strategy: NoOverlapStrategy,
pub out_label: Option<TagLabel>,
pub concatenate_spacer: Option<String>,
pub spacer_quality_char: Option<u8>,
pub reverse_complement_segment2: bool,
#[schemars(with = "String")]
#[tpd(adapt_in_verify(String))]
pub segment1: SegmentIndex,
#[schemars(with = "String")]
#[tpd(adapt_in_verify(String))]
pub segment2: SegmentIndex,
}
impl VerifyIn<PartialConfig> for PartialMergeReads {
fn verify(
&mut self,
parent: &PartialConfig,
_options: &VerifyOptions,
) -> std::result::Result<(), ValidationFailure>
where
Self: Sized + toml_pretty_deser::Visitor,
{
self.segment1.validate_segment(parent);
self.segment2.validate_segment(parent);
if let Some(segment1) = self.segment1.as_ref()
&& let Some(segment2) = self.segment2.as_ref()
&& let MustAdapt::PostVerify(index1) = segment1
&& let MustAdapt::PostVerify(index2) = segment2
&& index1 == index2
{
let spans = vec![
(
self.segment1.span(),
"Must be different from segment2".to_string(),
),
(
self.segment2.span(),
"Must be different from segment1".to_string(),
),
];
self.segment1 = TomlValue::new_custom(
Some(MustAdapt::PostVerify(*index1)),
spans,
Some(&format!(
"Available segments: {}",
parent
.input
.as_ref()
.expect("Expected input_def to be present at this point. Bug in config validation")
.get_segment_order()
.join(", ")
)),
);
}
if let Some(no_overlap_strategy) = self.no_overlap_strategy.as_ref()
&& let Some(concatenate_spacer) = self.concatenate_spacer.as_ref()
&& *no_overlap_strategy == NoOverlapStrategy::Concatenate
&& concatenate_spacer.is_none()
{
return Err(ValidationFailure::new(
"Missing key: 'concatenate_spacer'",
Some(
"concatenate_spacer is required when no_overlap_strategy = 'concatenate'. Please specify a spacer sequence (e.g., concatenate_spacer = \"NNNN\").",
),
));
}
self.min_overlap.verify(|v| {
if *v < 5 {
Err(ValidationFailure::new(
"Invalid value. Must be >= 5",
Some("Set a valid value."),
))
} else {
Ok(())
}
});
self.max_mismatch_rate.verify(|v| {
if *v < 0.0 || *v >= 1.0 {
Err(ValidationFailure::new(
"Invalid value. Must be in [0.0..1.0)",
Some("Set a valid value >= 0 and < 1.0."),
))
} else {
Ok(())
}
});
self.spacer_quality_char.verify(|opt_v| {
if let Some(v) = opt_v
&& !(33..=126).contains(v)
{
return Err(ValidationFailure::new(
"Invalid value. Must be in [33..126]",
Some("Set a valid value."),
));
}
Ok(())
});
Ok(())
}
}
impl TagUser for PartialTaggedVariant<PartialMergeReads> {
fn get_tag_usage(
&mut self,
_tags_available: &IndexMap<TagLabel, TagMetadata>,
_segment_order: &[String],
) -> Option<TagUsageInfo<'_>> {
if let Some(inner) = self.toml_value.value.as_mut() {
Some(TagUsageInfo {
declared_tag: inner.out_label.to_declared_tag(TagValueType::Bool),
..Default::default()
})
} else {
None }
}
}
impl Step for MergeReads {
fn apply(
&self,
mut block: FastQBlocksCombined,
_input_info: &InputInfo,
_demultiplex_info: &OptDemultiplex,
) -> anyhow::Result<(FastQBlocksCombined, bool)> {
let seg1_idx = self.segment1.0;
let seg2_idx = self.segment2.0;
let reverse_complement_segment2 = self.reverse_complement_segment2;
let no_overlap_strategy = self.no_overlap_strategy.clone();
let concatenate_spacer = self.concatenate_spacer.clone();
let spacer_qual = self.spacer_quality_char.unwrap_or(33);
let min_overlap = self.min_overlap;
let max_mismatch_rate = self.max_mismatch_rate;
let max_mismatch_count = self.max_mismatch_count;
let algorithm = self.algorithm.clone();
let merge_status: RefCell<Option<Vec<bool>>> = RefCell::new(
self.out_label
.as_ref()
.map(|_| Vec::with_capacity(block.len())),
);
block.apply_mut(|reads: &mut [WrappedFastQReadMut]| {
let read1_seq = reads[seg1_idx as usize].seq();
let read1_qual = reads[seg1_idx as usize].qual();
let read2_seq = reads[seg2_idx as usize].seq();
let read2_qual = reads[seg2_idx as usize].qual();
let (read2_seq_processed, read2_qual_processed): (Cow<[u8]>, Cow<[u8]>) =
if reverse_complement_segment2 {
let rc_seq = reverse_complement(read2_seq);
let rc_qual: Vec<u8> = read2_qual.iter().rev().copied().collect();
(Cow::Owned(rc_seq), Cow::Owned(rc_qual))
} else {
(Cow::Borrowed(read2_seq), Cow::Borrowed(read2_qual))
};
let merge_result = merge_reads(
read1_seq,
read1_qual,
&read2_seq_processed,
&read2_qual_processed,
&algorithm,
min_overlap,
max_mismatch_rate,
max_mismatch_count,
);
let was_merged = match merge_result {
MergeResult::Merged {
merged_seq,
merged_qual,
} => {
reads[seg1_idx as usize].replace_seq(&merged_seq, &merged_qual);
reads[seg2_idx as usize].clear();
true
}
MergeResult::NoOverlap => {
if no_overlap_strategy == NoOverlapStrategy::Concatenate {
let spacer = concatenate_spacer
.as_ref()
.expect("concatenate_spacer must be Some in concatenate mode");
let mut concatenated_seq = read1_seq.to_vec();
concatenated_seq.extend_from_slice(spacer.as_bytes());
concatenated_seq.extend_from_slice(&read2_seq_processed);
let mut concatenated_qual = read1_qual.to_vec();
concatenated_qual.extend(std::iter::repeat_n(spacer_qual, spacer.len()));
concatenated_qual.extend_from_slice(&read2_qual_processed);
reads[seg1_idx as usize].replace_seq(&concatenated_seq, &concatenated_qual);
reads[seg2_idx as usize].clear();
}
false
}
};
if let Some(merge_status) = merge_status.borrow_mut().as_mut() {
merge_status.push(was_merged);
}
});
if let Some(merge_status) = merge_status.take() {
let tag_values: TagColumn = TagColumn::Bool(merge_status.into_iter().collect());
block.tags.insert(
self.out_label
.as_ref()
.expect("out_label must be set for merge type")
.clone(),
tag_values,
);
}
Ok((block, true))
}
}
#[derive(Debug)]
enum MergeResult {
Merged {
merged_seq: Vec<u8>,
merged_qual: Vec<u8>,
},
NoOverlap,
}
#[expect(clippy::too_many_arguments, reason = "we need them")]
fn merge_reads(
seq1: &[u8],
qual1: &[u8],
seq2: &[u8],
qual2: &[u8],
algorithm: &Algorithm,
min_overlap: usize,
max_mismatch_rate: f64,
max_mismatch_count: usize,
) -> MergeResult {
match algorithm {
Algorithm::Fastp => {
let ov = find_best_overlap_fastp(
seq1,
seq2,
min_overlap,
max_mismatch_rate,
max_mismatch_count,
);
if let Some((offset, overlap_len)) = ov {
let (merged_seq, merged_qual) =
merge_at_offset_fastp(seq1, qual1, seq2, qual2, offset, overlap_len);
MergeResult::Merged {
merged_seq,
merged_qual,
}
} else {
MergeResult::NoOverlap
}
}
}
}
#[expect(
clippy::cast_possible_truncation,
reason = "u64 to usize is fine for our target systems"
)]
#[expect(
clippy::cast_sign_loss,
clippy::cast_precision_loss,
reason = "max_mismatch_rate is 0..=1"
)]
#[expect(clippy::nonminimal_bool, reason = "it's clearer this way")]
fn find_best_overlap_fastp(
seq1: &[u8],
seq2: &[u8], min_overlap: usize,
max_mismatch_rate: f64,
max_mismatch_count: usize,
) -> Option<(isize, usize)> {
let len1: isize = seq1
.len()
.try_into()
.expect("seq1 len too large for isize. Max supported read size is 2^31 bases");
let len2: isize = seq2
.len()
.try_into()
.expect("seq2 len too large for isize. Max supported read size is 2^31 bases");
let complete_compare_require = 50;
let mut overlap_len;
let mut offset: isize = 0;
let mut diff;
let overlap_require: isize = min_overlap
.try_into()
.expect("min_overlap too large for isize");
let diff_percent_limit = max_mismatch_rate;
let diff_limit = max_mismatch_count;
let str1 = seq1;
let str2 = seq2;
while offset
< len1
.checked_sub(overlap_require)
.expect("Subtraction below range - how large is your min_overlap that you exceed an isize after substraction?")
{
overlap_len = (len1 - offset).min(len2);
let overlap_diff_limit = diff_limit.min((overlap_len as f64 * diff_percent_limit) as usize);
diff = 0;
let mut last_i = 0;
for i in 0..overlap_len {
if str1[(offset + i) as usize] != str2[i as usize] {
diff += 1;
if diff > overlap_diff_limit && i < complete_compare_require {
break;
}
}
last_i = i + 1;
}
if diff <= overlap_diff_limit
|| (diff > overlap_diff_limit && last_i > complete_compare_require)
{
return Some((offset, overlap_len as usize));
}
offset += 1;
}
offset = 0;
while offset > -(len2 - overlap_require) {
overlap_len = len1.min(len2 - (offset.abs()));
let overlap_diff_limit = diff_limit.min((overlap_len as f64 * diff_percent_limit) as usize);
diff = 0;
let mut last_i = 0;
for i in 0..overlap_len {
if str1[i as usize] != str2[(-offset + i) as usize] {
diff += 1;
if diff > overlap_diff_limit && i < complete_compare_require {
break;
}
}
last_i = i + 1;
}
if diff <= overlap_diff_limit
|| (diff > overlap_diff_limit && last_i > complete_compare_require)
{
return Some((offset, overlap_len as usize));
}
offset -= 1;
}
None
}
#[expect(clippy::cast_sign_loss, reason = "Sign is checked before")]
fn merge_at_offset_fastp(
seq1: &[u8],
qual1: &[u8],
seq2: &[u8],
qual2: &[u8],
offset: isize,
overlap_len: usize,
) -> (Vec<u8>, Vec<u8>) {
#[expect(clippy::too_many_arguments, reason = "we need them")]
fn append_overlap(
seq1: &[u8],
qual1: &[u8],
seq2: &[u8],
qual2: &[u8],
overlap_len: usize,
merged_seq: &mut Vec<u8>,
merged_qual: &mut Vec<u8>,
offset: usize,
inside_out: bool,
) {
for i in 0..overlap_len {
let (pos1, pos2) = if inside_out {
(i, offset + i)
} else {
(offset + i, i)
};
let base1 = seq1[pos1];
let base2 = seq2[pos2];
let q1 = qual1[pos1];
let q2 = qual2[pos2];
if base1 == base2 {
merged_seq.push(base1);
merged_qual.push(q1);
} else {
const GOOD_QUAL: u8 = 30 + 33;
const BAD_QUAL: u8 = 14 + 33;
if q2 >= GOOD_QUAL && q1 <= BAD_QUAL {
merged_seq.push(base2);
merged_qual.push(q2);
} else {
merged_seq.push(base1);
merged_qual.push(q1);
}
}
}
}
let mut merged_seq = Vec::new();
let mut merged_qual = Vec::new();
if offset >= 0 {
let offset = offset as usize;
merged_seq.extend_from_slice(&seq1[..offset]);
merged_qual.extend_from_slice(&qual1[..offset]);
append_overlap(
seq1,
qual1,
seq2,
qual2,
overlap_len,
&mut merged_seq,
&mut merged_qual,
offset,
false,
);
if offset > 0 && overlap_len < seq2.len() {
merged_seq.extend_from_slice(&seq2[overlap_len..]);
merged_qual.extend_from_slice(&qual2[overlap_len..]);
}
} else {
let offset = (-offset) as usize;
append_overlap(
seq1,
qual1,
seq2,
qual2,
overlap_len,
&mut merged_seq,
&mut merged_qual,
offset,
true,
);
}
(merged_seq, merged_qual)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_find_best_overlap_fastp() {
let seq1 = b"ACGTACGTACGT";
let seq2 = b"GTACGTACGTAA";
let result = find_best_overlap_fastp(seq1, seq2, 4, 0.2, 2);
assert_eq!(result, Some((2, 10)));
let result = find_best_overlap_fastp(b"AGTCAA", b"CTCCA", 4, 0.2, 2);
assert_eq!(result, None);
let result = find_best_overlap_fastp(b"AGTCAA", b"AGTCAA", 4, 0.2, 2);
assert_eq!(result, Some((0, 6))); let result = find_best_overlap_fastp(b"AGTCAA", b"ACAGTCAA", 4, 0.2, 2);
assert_eq!(result, Some((-2, 6)));
let r = merge_at_offset_fastp(
b"AAAAAAAAAAAAAAAAAA",
b"???@@@>>>./0./0./0",
b"TTTTTTTTTTTTTTTTTT",
b"./0./0./0???@@@>>>",
0,
18,
);
assert_eq!(&r.0, b"AAAAAAAAATTATTAAAA");
}
}