use std::path::Path;
use crate::model::block::AbsoluteBlock;
use crate::model::error::ChainError;
use crate::seq::revcomp::reverse_complement_in_place;
use crate::seq::score::gapcalc::GapCalc;
use crate::seq::score::scoring::{CompactMatrix, ScoreMatrix};
use crate::seq::sequence::{SequenceCache, SequenceResolver};
use crate::{OwnedChain, Strand};
#[derive(Debug, Clone, Copy)]
pub struct ScoreConfig {
pub min_score: i64,
}
impl Default for ScoreConfig {
fn default() -> Self {
ScoreConfig { min_score: 1000 }
}
}
#[derive(Debug, Clone)]
pub struct ChainScorer {
reference: SequenceResolver,
query: SequenceResolver,
matrix: ScoreMatrix,
compact: CompactMatrix,
gap: GapCalc,
}
impl ChainScorer {
pub fn new<P: AsRef<Path>, Q: AsRef<Path>>(
reference: P,
query: Q,
matrix: ScoreMatrix,
gap: GapCalc,
) -> Result<Self, ChainError> {
let compact = matrix.compact();
Ok(Self {
reference: SequenceResolver::new(reference)?,
query: SequenceResolver::new(query)?,
matrix,
compact,
gap,
})
}
pub fn matrix(&self) -> &ScoreMatrix {
&self.matrix
}
pub fn score_chain(
&self,
cache: &mut SequenceCache,
chain: &OwnedChain,
) -> Result<i64, ChainError> {
let t_len = span_len(chain.reference_start, chain.reference_end, "target")?;
let q_len = span_len(chain.query_start, chain.query_end, "query")?;
let t_seq =
self.reference
.fetch(cache, &chain.reference_name, chain.reference_start, t_len)?;
let q_seq = match chain.query_strand {
Strand::Plus => self
.query
.fetch(cache, &chain.query_name, chain.query_start, q_len)?,
Strand::Minus => {
let fetch_start = chain
.query_size
.checked_sub(chain.query_end)
.ok_or_else(|| score_error("query minus-strand fetch underflows"))?;
let mut seq = self
.query
.fetch(cache, &chain.query_name, fetch_start, q_len)?;
reverse_complement_in_place(&mut seq);
seq
}
};
let mut score: i64 = 0;
let mut t_pos = chain.reference_start;
let mut q_pos = chain.query_start;
let block_count = chain.blocks.len();
for (index, block) in chain.blocks.iter().enumerate() {
let size = block.size as usize;
let t_off = (t_pos - chain.reference_start) as usize;
let q_off = (q_pos - chain.query_start) as usize;
let t_block = t_seq
.get(t_off..t_off + size)
.ok_or_else(|| score_error("target block exceeds fetched sequence"))?;
let q_block = q_seq
.get(q_off..q_off + size)
.ok_or_else(|| score_error("query block exceeds fetched sequence"))?;
score += score_ungapped_slices(q_block, t_block, &self.compact);
if index + 1 < block_count {
let dq = gap_to_i32(block.gap_query, "query")?;
let dt = gap_to_i32(block.gap_reference, "reference")?;
let cost = self.gap.cost(dq, dt);
score -= i64::from(cost);
}
t_pos += block.size + block.gap_reference;
q_pos += block.size + block.gap_query;
}
Ok(score)
}
}
#[inline]
pub fn score_ungapped_slices(query: &[u8], reference: &[u8], matrix: &CompactMatrix) -> i64 {
assert_eq!(
query.len(),
reference.len(),
"ungapped scoring requires equal-length slices"
);
let mut score = 0i64;
for i in 0..query.len() {
score += i64::from(matrix.pair(query[i], reference[i]));
}
score
}
pub fn score_absolute_block(
block: AbsoluteBlock,
query_seq: &[u8],
reference_seq: &[u8],
matrix: &CompactMatrix,
) -> Result<i64, ChainError> {
validate_score_block(block)?;
let reference = absolute_slice(
reference_seq,
block.reference_start,
block.reference_end,
"reference",
)?;
let query = absolute_slice(query_seq, block.query_start, block.query_end, "query")?;
Ok(score_ungapped_slices(query, reference, matrix))
}
pub fn score_absolute_blocks(
blocks: &[AbsoluteBlock],
query_seq: &[u8],
reference_seq: &[u8],
matrix: &CompactMatrix,
gap: &GapCalc,
) -> Result<i64, ChainError> {
if blocks.is_empty() {
return Err(score_error("absolute block list is empty"));
}
let mut score = 0i64;
for (index, block) in blocks.iter().copied().enumerate() {
score += score_absolute_block(block, query_seq, reference_seq, matrix)?;
if let Some(next) = blocks.get(index + 1).copied() {
if next.reference_start <= block.reference_start {
return Err(score_error(
"absolute blocks are not sorted by reference start",
));
}
if next.query_start <= block.query_start {
return Err(score_error("absolute blocks are not sorted by query start"));
}
let dt = next
.reference_start
.checked_sub(block.reference_end)
.ok_or_else(|| score_error("absolute block reference coordinates overlap"))?;
let dq = next
.query_start
.checked_sub(block.query_end)
.ok_or_else(|| score_error("absolute block query coordinates overlap"))?;
score -= i64::from(gap.cost(gap_to_i32(dq, "query")?, gap_to_i32(dt, "reference")?));
}
}
Ok(score)
}
fn span_len(start: u32, end: u32, label: &str) -> Result<u32, ChainError> {
end.checked_sub(start)
.ok_or_else(|| score_error(format!("{label} span underflows")))
}
fn validate_score_block(block: AbsoluteBlock) -> Result<(), ChainError> {
block.validate()?;
if block.aligned_len().is_none() {
return Err(score_error(
"absolute block reference and query lengths differ",
));
}
Ok(())
}
fn absolute_slice<'a>(
seq: &'a [u8],
start: u32,
end: u32,
label: &str,
) -> Result<&'a [u8], ChainError> {
if end < start {
return Err(score_error(format!("{label} span underflows")));
}
seq.get(start as usize..end as usize)
.ok_or_else(|| score_error(format!("{label} absolute block exceeds sequence")))
}
fn gap_to_i32(gap: u32, label: &str) -> Result<i32, ChainError> {
i32::try_from(gap).map_err(|_| score_error(format!("{label} gap exceeds i32 range")))
}
fn score_error(message: impl Into<String>) -> ChainError {
ChainError::Unsupported {
msg: message.into().into(),
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{AbsoluteBlock, Block};
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;
use std::path::PathBuf;
use std::sync::atomic::{AtomicUsize, Ordering};
use twobit::convert::fasta::FastaReader;
use twobit::convert::to_2bit;
static NEXT_TEMP_ID: AtomicUsize = AtomicUsize::new(0);
struct TempDir {
path: PathBuf,
}
impl TempDir {
fn new() -> Self {
let id = NEXT_TEMP_ID.fetch_add(1, Ordering::Relaxed);
let path = std::env::temp_dir().join(format!(
"chaintools-chainscore-test-{}-{id}",
std::process::id()
));
std::fs::create_dir_all(&path).expect("create temp dir");
Self { path }
}
fn path(&self) -> &Path {
&self.path
}
}
impl Drop for TempDir {
fn drop(&mut self) {
let _ = std::fs::remove_dir_all(&self.path);
}
}
fn write_twobit(path: &Path, fasta: &str) {
let reader = FastaReader::mem_open(fasta.as_bytes().to_vec()).expect("open FASTA");
let mut writer = BufWriter::new(File::create(path).expect("create 2bit"));
to_2bit(&mut writer, &reader).expect("write 2bit");
writer.flush().expect("flush 2bit");
}
#[allow(clippy::too_many_arguments)]
fn single_block_chain(
reference_name: &str,
reference_size: u32,
query_name: &str,
query_size: u32,
query_strand: Strand,
query_start: u32,
query_end: u32,
size: u32,
) -> OwnedChain {
OwnedChain {
score: 0,
reference_name: reference_name.as_bytes().to_vec(),
reference_size,
reference_strand: Strand::Plus,
reference_start: 0,
reference_end: size,
query_name: query_name.as_bytes().to_vec(),
query_size,
query_strand,
query_start,
query_end,
id: 1,
blocks: vec![Block {
size,
gap_reference: 0,
gap_query: 0,
}],
}
}
fn scorer(reference: &Path, query: &Path) -> ChainScorer {
ChainScorer::new(
reference,
query,
ScoreMatrix::default_dna(),
GapCalc::default_costs(),
)
.expect("build scorer")
}
#[test]
fn score_ungapped_slices_scores_query_against_reference() {
let compact = ScoreMatrix::default_dna().compact();
assert_eq!(score_ungapped_slices(b"CN", b"AC", &compact), -114);
}
#[test]
fn score_absolute_block_scores_full_sequence_slices() {
let compact = ScoreMatrix::default_dna().compact();
let block = AbsoluteBlock {
reference_start: 2,
reference_end: 6,
query_start: 2,
query_end: 6,
};
let score =
score_absolute_block(block, b"GGACGTCC", b"TTACGTAA", &compact).expect("score block");
assert_eq!(score, 382);
}
#[test]
fn score_absolute_blocks_subtracts_query_gap_cost() {
let compact = ScoreMatrix::default_dna().compact();
let gap = GapCalc::default_costs();
let blocks = [
AbsoluteBlock {
reference_start: 0,
reference_end: 4,
query_start: 0,
query_end: 4,
},
AbsoluteBlock {
reference_start: 4,
reference_end: 8,
query_start: 6,
query_end: 10,
},
];
let score = score_absolute_blocks(&blocks, b"ACGTNNACGT", b"ACGTACGT", &compact, &gap)
.expect("score absolute blocks");
assert_eq!(score, 404);
}
#[test]
fn score_absolute_blocks_rejects_overlap() {
let compact = ScoreMatrix::default_dna().compact();
let gap = GapCalc::default_costs();
let blocks = [
AbsoluteBlock {
reference_start: 0,
reference_end: 4,
query_start: 0,
query_end: 4,
},
AbsoluteBlock {
reference_start: 3,
reference_end: 7,
query_start: 4,
query_end: 8,
},
];
let err =
score_absolute_blocks(&blocks, b"ACGTACGT", b"ACGTACGT", &compact, &gap).unwrap_err();
assert!(err.to_string().contains("reference coordinates overlap"));
}
#[test]
fn single_block_acgt_plus_plus() {
let temp = TempDir::new();
let reference = temp.path().join("t.2bit");
let query = temp.path().join("q.2bit");
write_twobit(&reference, ">chr1\nACGT\n");
write_twobit(&query, ">chr1\nACGT\n");
let scorer = scorer(&reference, &query);
let mut cache = SequenceCache::default();
let chain = single_block_chain("chr1", 4, "chr1", 4, Strand::Plus, 0, 4, 4);
assert_eq!(scorer.score_chain(&mut cache, &chain).unwrap(), 382);
}
#[test]
fn single_block_eight_bases() {
let temp = TempDir::new();
let reference = temp.path().join("t.2bit");
let query = temp.path().join("q.2bit");
write_twobit(&reference, ">chr1\nACGTACGT\n");
write_twobit(&query, ">chr1\nACGTACGT\n");
let scorer = scorer(&reference, &query);
let mut cache = SequenceCache::default();
let chain = single_block_chain("chr1", 8, "chr1", 8, Strand::Plus, 0, 8, 8);
assert_eq!(scorer.score_chain(&mut cache, &chain).unwrap(), 764);
}
#[test]
fn mismatch_and_degenerate_contributions() {
let temp = TempDir::new();
let reference = temp.path().join("t.2bit");
let query = temp.path().join("q.2bit");
write_twobit(&reference, ">chr1\nAC\n");
write_twobit(&query, ">chr1\nCN\n");
let scorer = scorer(&reference, &query);
let mut cache = SequenceCache::default();
let chain = single_block_chain("chr1", 2, "chr1", 2, Strand::Plus, 0, 2, 2);
assert_eq!(scorer.score_chain(&mut cache, &chain).unwrap(), -114);
}
#[test]
fn two_blocks_with_query_gap() {
let temp = TempDir::new();
let reference = temp.path().join("t.2bit");
let query = temp.path().join("q.2bit");
write_twobit(&reference, ">chr1\nACGTACGT\n");
write_twobit(&query, ">chr1\nACGTNNACGT\n");
let scorer = scorer(&reference, &query);
let mut cache = SequenceCache::default();
let chain = OwnedChain {
score: 0,
reference_name: b"chr1".to_vec(),
reference_size: 8,
reference_strand: Strand::Plus,
reference_start: 0,
reference_end: 8,
query_name: b"chr1".to_vec(),
query_size: 10,
query_strand: Strand::Plus,
query_start: 0,
query_end: 10,
id: 1,
blocks: vec![
Block {
size: 4,
gap_reference: 0,
gap_query: 2,
},
Block {
size: 4,
gap_reference: 0,
gap_query: 0,
},
],
};
assert_eq!(scorer.score_chain(&mut cache, &chain).unwrap(), 404);
}
#[test]
fn minus_strand_query_matches_plus_equivalent() {
let temp = TempDir::new();
let reference = temp.path().join("t.2bit");
let query = temp.path().join("q.2bit");
write_twobit(&reference, ">chr1\nAGTC\n");
write_twobit(&query, ">chr1\nTTGACTAA\n");
let scorer = scorer(&reference, &query);
let mut cache = SequenceCache::default();
let chain = single_block_chain("chr1", 4, "chr1", 8, Strand::Minus, 2, 6, 4);
assert_eq!(scorer.score_chain(&mut cache, &chain).unwrap(), 382);
}
}