use bed::feature::RecordBuf;
use bed::io::writer::Writer as BedWriter;
use bstr::ByteSlice;
use noodles_bed as bed;
use noodles_core::Position;
use noodles_fasta::Record as FastaRecord;
use regex::Regex;
use serde::Serialize;
use sha2::Digest;
use sha2::Sha256;
use std::error::Error;
use std::fs::File;
use std::path::Path;
use std::io::BufWriter;
use std::result::Result;
#[derive(Debug, Clone, Serialize)]
pub struct SequenceStatistics {
pub sequence_name: String,
pub non_masked_bases: usize,
pub soft_masked_bases: usize,
pub hard_masked_bases: usize,
pub non_masked_ratio: f64,
pub soft_masked_ratio: f64,
pub hard_masked_ratio: f64,
pub gc_content: f64,
pub other_iupac_bases: usize,
pub sequence_length: usize,
pub checksum_sha256: String,
}
pub fn process_fasta(
output_dir: Option<&Path>,
sequence_match_regex: &str,
ignore_iupac: bool,
) -> impl Fn(&FastaRecord) -> Option<SequenceStatistics> {
move |record| process_fasta_record(record, output_dir, sequence_match_regex, ignore_iupac)
}
fn process_fasta_record(
record: &FastaRecord,
output_dir: Option<&Path>,
sequence_match_regex: &str,
ignore_iupac: bool,
) -> Option<SequenceStatistics> {
let record_name: &str = record.definition().name().to_str().expect(
format!(
"Failed to convert record name to string: '{}'",
record.definition().name()
)
.as_str(),
);
let regex_matcher = Regex::new(ensure_full_match_regex(sequence_match_regex).as_str());
if regex_matcher.is_err() {
panic!("Invalid regular expression: '{}'", sequence_match_regex);
} else if !regex_matcher.unwrap().is_match(record_name) {
return None;
}
if record.sequence().len() == 0 {
return Some(SequenceStatistics {
sequence_name: record_name.to_string(),
non_masked_bases: 0,
soft_masked_bases: 0,
hard_masked_bases: 0,
non_masked_ratio: 0.0,
soft_masked_ratio: 0.0,
hard_masked_ratio: 0.0,
gc_content: 0.0,
other_iupac_bases: 0,
sequence_length: 0,
checksum_sha256: "".to_string(),
});
}
let mut sha256_hasher = Sha256::new();
let mut non_mask_bed_writer = create_bed_writer(output_dir, "non-masked", record_name);
let mut soft_mask_bed_writer = create_bed_writer(output_dir, "soft-masked", record_name);
let mut hard_mask_bed_writer = create_bed_writer(output_dir, "hard-masked", record_name);
let sequence: &[u8] = record.sequence().as_ref();
let mut index1: usize = 0;
let mut gc_counter: usize = 0;
let mut other_iupac_bases_counter: usize = 0;
let mut non_mask_counter: usize = 0;
let mut soft_mask_counter: usize = 0;
let mut hard_mask_counter: usize = 0;
let mut non_mask_region_start1: Option<usize> = None;
let mut soft_mask_region_start1: Option<usize> = None;
let mut hard_mask_region_start1: Option<usize> = None;
for base in sequence {
index1 += 1;
let mut non_masking: bool = false;
let mut soft_masking: bool = false;
let mut hard_masking: bool = false;
sha256_hasher.update([*base]);
match *base {
b'C' | b'G' => {
gc_counter += 1;
non_mask_counter += 1;
non_masking = true;
}
b'c' | b'g' => {
gc_counter += 1;
soft_mask_counter += 1;
soft_masking = true;
}
b'A' | b'T' => {
non_mask_counter += 1;
non_masking = true;
}
b'a' | b't' => {
soft_mask_counter += 1;
soft_masking = true;
}
b'n' | b'N' => {
hard_mask_counter += 1;
hard_masking = true;
}
_ => {
if !ignore_iupac {
panic!("Unexpected base: '{}' in sequence '{}'.", *base as char, record_name);
} else if !is_unsupported_iupac_code(*base) {
panic!("Unexpected (non-IUPAC) base: '{}' in sequence '{}'.", *base as char, record_name);
} else {
other_iupac_bases_counter += 1;
}
}
}
update_mask_region(&mut non_mask_region_start1, non_masking, non_mask_bed_writer.as_mut(), record_name, index1);
update_mask_region(&mut soft_mask_region_start1, soft_masking, soft_mask_bed_writer.as_mut(), record_name, index1);
update_mask_region(&mut hard_mask_region_start1, hard_masking, hard_mask_bed_writer.as_mut(), record_name, index1);
}
update_mask_region(&mut non_mask_region_start1, false, non_mask_bed_writer.as_mut(), record_name, index1 + 1);
update_mask_region(&mut soft_mask_region_start1, false, soft_mask_bed_writer.as_mut(), record_name, index1 + 1);
update_mask_region(&mut hard_mask_region_start1, false, hard_mask_bed_writer.as_mut(), record_name, index1 + 1);
assert!(
non_mask_counter + soft_mask_counter + hard_mask_counter + other_iupac_bases_counter == sequence.len(),
"The sum of masked bases does not match the sequence length ({}) for '{}'. This seems to be a bug.",
sequence.len(),
record_name
);
return Some(SequenceStatistics {
sequence_name: record_name.to_string(),
non_masked_bases: non_mask_counter,
soft_masked_bases: soft_mask_counter,
hard_masked_bases: hard_mask_counter,
non_masked_ratio: non_mask_counter as f64 / sequence.len() as f64,
soft_masked_ratio: soft_mask_counter as f64 / sequence.len() as f64,
hard_masked_ratio: hard_mask_counter as f64 / sequence.len() as f64,
gc_content: gc_counter as f64 / sequence.len() as f64,
other_iupac_bases: other_iupac_bases_counter,
sequence_length: sequence.len(),
checksum_sha256: format!("{:x}", sha256_hasher.finalize()),
});
}
fn create_bed_writer(
output_dir: Option<&Path>,
bed_ending: &str,
record_name: &str,
) -> Option<BedWriter<3, BufWriter<File>>> {
output_dir.and_then(|output_dir| {
let output_path = output_dir.join(&format!("{}.{}.bed", record_name, bed_ending));
Some(
bed::io::writer::Builder::default()
.build_from_path(output_path.clone())
.expect(&format!(
"Could not write to output BED file '{}'.",
output_path.to_str().unwrap()
)),
)
})
}
fn write_bed_record<X: std::io::Write>(
writer: &mut BedWriter<3, X>,
sequence_name: &str,
start1: usize,
end1: usize,
) -> Result<(), Box<dyn Error>> {
let record = RecordBuf::<3>::builder()
.set_reference_sequence_name(sequence_name)
.set_feature_start(Position::try_from(start1)?)
.set_feature_end(Position::try_from(end1 - 1)?) .build();
writer.write_feature_record(&record)?;
Ok(())
}
fn update_mask_region<X: std::io::Write>(
region_start: &mut Option<usize>,
masking: bool,
writer_opt: Option<&mut BedWriter<3, X>>,
record_name: &str,
index1: usize,
) {
if let Some(writer) = writer_opt {
if masking {
if region_start.is_none() {
*region_start = Some(index1);
}
} else if let Some(start1) = *region_start {
let _ = write_bed_record(writer, record_name, start1, index1);
*region_start = None;
}
}
}
fn is_unsupported_iupac_code(base: u8) -> bool {
matches!(base, b'b' | b'B' | b'd' | b'D' | b'h' | b'H' | b'k' | b'K' | b'm' | b'M' | b'r' | b'R' | b's' | b'S' | b'v' | b'V' | b'w' | b'W' | b'y' | b'Y')
}
fn ensure_full_match_regex(regex: &str) -> String {
let start_ok = regex.starts_with('^');
let end_ok = regex.ends_with('$');
if start_ok && end_ok {
regex.to_string()
} else if start_ok && !end_ok {
format!("{}$", regex)
} else if !start_ok && end_ok {
format!("^{}", regex)
} else {
format!("^{}$", regex)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn ensure_full_match_regex_ok() {
assert_eq!(ensure_full_match_regex(".*"), "^.*$");
assert_eq!(ensure_full_match_regex("^.*"), "^.*$");
assert_eq!(ensure_full_match_regex(".*$"), "^.*$");
assert_eq!(ensure_full_match_regex("^.*$"), "^.*$");
assert_eq!(ensure_full_match_regex("abc"), "^abc$");
assert_eq!(ensure_full_match_regex("^abc"), "^abc$");
assert_eq!(ensure_full_match_regex("abc$"), "^abc$");
assert_eq!(ensure_full_match_regex("^abc$"), "^abc$");
}
#[test]
fn process_fasta_record_ok() -> Result<(), Box<dyn Error>> {
let record = FastaRecord::new(
noodles_fasta::record::Definition::new("test_sequence", None),
noodles_fasta::record::Sequence::from(
b"CTGTGCTGGCATAGTGGTCTCACCTCCGGCAGtatcaccaccactgggcacaagcttctccagcacagcaNNnNnactgtgtcttatttctccttgtactcccagtgttcacaccatgctgcactcacagaagactcttcgttgatattt".to_vec()),
);
let tmpdir = tempfile::tempdir()?;
let stats = process_fasta_record(&record, Some(&tmpdir.path()), ".*", false);
assert!(stats.is_some());
let stats = stats.unwrap();
let non_masked_bed_path = tmpdir.path().join("test_sequence.non-masked.bed");
let non_masked_bed = std::fs::read_to_string(non_masked_bed_path)?;
assert_eq!(non_masked_bed, "test_sequence\t0\t32\n");
let soft_masked_bed_path = tmpdir.path().join("test_sequence.soft-masked.bed");
let soft_masked_bed = std::fs::read_to_string(soft_masked_bed_path)?;
assert_eq!(soft_masked_bed, "test_sequence\t32\t70\ntest_sequence\t75\t150\n");
let hard_masked_bed_path = tmpdir.path().join("test_sequence.hard-masked.bed");
let hard_masked_bed = std::fs::read_to_string(hard_masked_bed_path)?;
assert_eq!(hard_masked_bed, "test_sequence\t70\t75\n");
assert_eq!(stats.sequence_name, "test_sequence");
assert_eq!(stats.non_masked_bases, 32);
assert_eq!(stats.non_masked_ratio, 32.0 / 150.0);
assert_eq!(stats.soft_masked_bases, 113);
assert_eq!(stats.soft_masked_ratio, 113.0 / 150.0);
assert_eq!(stats.hard_masked_bases, 5);
assert_eq!(stats.hard_masked_ratio, 5.0 / 150.0);
assert_eq!(stats.gc_content, 73.0 / 150.0);
assert_eq!(stats.other_iupac_bases, 0);
assert_eq!(stats.sequence_length, 150);
assert_eq!(stats.checksum_sha256, "39d1aba0a51cb46ce7cef81ac808bf46e3594b0ccabcd77fabf19c2a395651fa");
Ok(())
}
#[test]
fn process_fasta_record_non_iupac_codes_ok() -> Result<(), Box<dyn Error>> {
let record = FastaRecord::new(
noodles_fasta::record::Definition::new("iupac_test_sequence", None),
noodles_fasta::record::Sequence::from(b"mMrvy".to_vec()),
);
let tmpdir = tempfile::tempdir()?;
let stats = process_fasta_record(&record, Some(&tmpdir.path()), ".*", true);
assert!(stats.is_some());
let stats = stats.unwrap();
let non_masked_bed_path = tmpdir.path().join("test_sequence.non-masked.bed");
assert_eq!(non_masked_bed_path.exists(), false);
let soft_masked_bed_path = tmpdir.path().join("test_sequence.soft-masked.bed");
assert_eq!(soft_masked_bed_path.exists(), false);
let hard_masked_bed_path = tmpdir.path().join("test_sequence.hard-masked.bed");
assert_eq!(hard_masked_bed_path.exists(), false);
assert_eq!(stats.sequence_name, "iupac_test_sequence");
assert_eq!(stats.non_masked_bases, 0);
assert_eq!(stats.non_masked_ratio, 0.0);
assert_eq!(stats.soft_masked_bases, 0);
assert_eq!(stats.soft_masked_ratio, 0.0);
assert_eq!(stats.hard_masked_bases, 0);
assert_eq!(stats.hard_masked_ratio, 0.0);
assert_eq!(stats.gc_content, 0.0);
assert_eq!(stats.other_iupac_bases, 5);
assert_eq!(stats.sequence_length, 5);
assert_eq!(stats.checksum_sha256, "fed4eff301b269d726775e3a29cdc4b015b3b5d9e2d771ecf9964ccac590d30f");
Ok(())
}
}