use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::BufReader;
use std::iter::zip;
use chain::liftover;
use chainfile as chain;
use flate2::read::GzDecoder;
use noodles::fasta;
use noodles::fasta::record::Sequence;
use omics::coordinate::Contig;
use omics::coordinate::Strand;
use omics::coordinate::interbase::Coordinate;
use omics::coordinate::interval::interbase::Interval;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let chain_file_path = env::args().nth(1).expect("missing chainfile path");
let reference_fasta_path = env::args().nth(2).expect("missing reference fasta path");
let query_fasta_path = env::args().nth(3).expect("missing query fasta path");
let chain = File::open(chain_file_path)
.map(GzDecoder::new)
.map(BufReader::new)
.map(chain::Reader::new)?;
let machine = liftover::machine::builder::Builder.try_build_from(chain)?;
let mut reference = HashMap::new();
for result in fasta::reader::Builder
.build_from_path(reference_fasta_path)?
.records()
{
let record = result?;
let name = String::from_utf8_lossy(record.name()).to_string();
reference.insert(name, record.sequence().clone());
}
let mut query = HashMap::new();
for result in fasta::reader::Builder
.build_from_path(query_fasta_path)?
.records()
{
let record = result?;
let name = String::from_utf8_lossy(record.name()).to_string();
query.insert(name, record.sequence().clone());
}
let intervals = reference
.iter()
.map(|(name, reference_sequence)| {
Interval::try_new(
Coordinate::new(
Contig::new_unchecked(name.as_str()),
Strand::Positive,
0_u64,
),
Coordinate::new(
Contig::new_unchecked(name.as_str()),
Strand::Positive,
reference_sequence.len() as u64,
),
)
.unwrap()
})
.collect::<Vec<_>>();
let mut total_positions = 0usize;
let mut total_mismatches = 0usize;
let mut positive_positions = 0usize;
let mut positive_mismatches = 0usize;
let mut negative_positions = 0usize;
let mut negative_mismatches = 0usize;
for interval in intervals {
let reference_sequence = reference
.get(interval.contig().as_str())
.unwrap_or_else(|| {
panic!(
"reference FASTA did not contain necessary contig \"{}\"; are the FASTA files \
and chain file correct?",
interval.contig().as_str()
)
});
let results = match machine.liftover(interval.clone()) {
Some(results) => results,
None => {
println!("INFO: region {interval} is not included in the query genome.");
continue;
}
};
for chain_result in results {
for segment in chain_result.into_segments() {
let query_strand = segment.query().strand();
let query_sequence =
query
.get(segment.query().contig().as_str())
.unwrap_or_else(|| {
panic!(
"query FASTA did not contain necessary contig \"{}\": are the \
FASTA files and chain file correct?",
segment.query().contig()
)
});
let (reference_interval, query_interval) = segment.into_parts();
let reference = get_sequence_for_interval(reference_sequence, reference_interval);
let query = get_sequence_for_interval(query_sequence, query_interval);
assert_eq!(reference.len(), query.len());
let positions = query.len();
let mismatches = count_mismatches(&reference, &query);
total_positions += positions;
total_mismatches += mismatches;
if query_strand == Strand::Positive {
positive_positions += positions;
positive_mismatches += mismatches;
} else {
negative_positions += positions;
negative_mismatches += mismatches;
}
}
}
}
println!();
let total_matched = (1.0 - (total_mismatches as f64 / total_positions as f64)) * 100.0;
println!("Total mismatches found: {total_mismatches}");
println!("Total positions checked: {total_positions}");
println!("Percentage match for all regions: {total_matched:.1}%");
println!();
let positive_matched = (1.0 - (positive_mismatches as f64 / positive_positions as f64)) * 100.0;
println!("> Positive strand mismatches found: {positive_mismatches}");
println!("> Positive strand positions checked: {positive_positions}");
println!("> Percentage match for positive stranded query regions: {positive_matched:.1}%");
println!();
let negative_matched = (1.0 - (negative_mismatches as f64 / negative_positions as f64)) * 100.0;
println!("> Negative strand mismatches found: {negative_mismatches}");
println!("> Negative strand positions checked: {negative_positions}");
println!("> Percentage match for negative stranded regions: {negative_matched:.1}%");
Ok(())
}
fn get_sequence_for_interval(sequence: &Sequence, interval: Interval) -> Vec<Nucleotide> {
let strand = interval.strand();
let interval = parse_interval(interval);
let nucleotides = sequence
.slice(interval)
.expect("sequence lookup did not succeed: are the FASTA files and chain file correct?")
.as_ref()
.iter()
.map(Nucleotide::from)
.collect::<Vec<_>>();
match strand {
Strand::Positive => nucleotides,
Strand::Negative => nucleotides
.into_iter()
.rev()
.map(|n| n.complement())
.collect::<Vec<_>>(),
}
}
fn parse_interval(interval: Interval) -> noodles::core::region::Interval {
let interval = interval.into_equivalent_base();
let (start, end) = match interval.strand() {
Strand::Positive => (
interval.start().position().get() as usize,
interval.end().position().get() as usize,
),
Strand::Negative => (
interval.end().position().get() as usize,
interval.start().position().get() as usize,
),
};
let start = noodles::core::Position::new(start).unwrap();
let end = noodles::core::Position::new(end).unwrap();
noodles::core::region::Interval::from(start..=end)
}
#[derive(Debug, Eq, PartialEq)]
pub enum Nucleotide {
A,
C,
G,
T,
Other,
}
impl Nucleotide {
pub fn complement(self) -> Nucleotide {
match self {
Nucleotide::A => Nucleotide::T,
Nucleotide::C => Nucleotide::G,
Nucleotide::G => Nucleotide::C,
Nucleotide::T => Nucleotide::A,
Nucleotide::Other => Nucleotide::Other,
}
}
}
impl From<&u8> for Nucleotide {
fn from(value: &u8) -> Self {
match value {
b'A' => Nucleotide::A,
b'a' => Nucleotide::A,
b'C' => Nucleotide::C,
b'c' => Nucleotide::C,
b'G' => Nucleotide::G,
b'g' => Nucleotide::G,
b'T' => Nucleotide::T,
b't' => Nucleotide::T,
_ => Nucleotide::Other,
}
}
}
pub fn count_mismatches(reference: &[Nucleotide], query: &[Nucleotide]) -> usize {
let mut result = 0;
for (r, q) in zip(reference.iter(), query.iter()) {
if r != q {
result += 1;
}
}
result
}