use core::panic;
use std::collections::HashSet;
use std::fs::File;
use std::io::BufRead;
use std::io::BufReader;
use std::io::BufWriter;
use std::io::Write;
fn switch_base(base: char) -> char{
match base {
'A' => 'T',
'T' => 'A',
'C' => 'G',
'G' => 'C',
'N' => 'N',
_ => panic!("unknown base")
}
}
pub fn reverse_complement(seq: &str) -> String {
let mut rc = String::with_capacity(seq.len());
for c in seq.chars().rev() {
rc.push(switch_base(c))
}
rc
}
#[derive(Debug)]
pub struct FastqEntry {
pub header: String,
pub seq: String,
pub phred: String,
}
impl FastqEntry {
pub fn to_string(&self) -> String {
let mut s = String::with_capacity(self.header.len() + self.seq.len() * 2 + 5); s.push_str(&self.header);
s.push('\n');
s.push_str(&self.seq);
s.push_str("\n+\n");
s.push_str(&self.phred);
s.push('\n');
s
}
}
use core::str;
use noodles::bgzf as noodles_bgzf;
use noodles::fastq as fastq;
pub struct FastIterator {
reader: fastq::Reader<noodles_bgzf::Reader<File>>,
buffer: fastq::Record,
}
impl FastIterator {
pub fn new(fastqname: &str) -> Self {
let decoder = noodles_bgzf::reader::Builder.build_from_path(fastqname).unwrap();
let reader = fastq::io::Reader::new(decoder);
FastIterator {
reader ,
buffer: fastq::Record::new(fastq::record::Definition::new("r0", ""), "AGCT", "NDLS")
}
}
}
fn noodles_record_to_fastq_entry(record: &fastq::Record) -> FastqEntry {
let seq_bytes = record.sequence();
let phred_bytes = record.quality_scores();
let phred_string = str::from_utf8(phred_bytes).unwrap();
let seq_string = str::from_utf8(seq_bytes).unwrap();
let mut name_string = record.name().to_string();
let desc = record.description().to_string();
name_string.push(' ');
name_string.push_str(&desc);
FastqEntry {
seq: seq_string.to_owned(),
phred: phred_string.to_owned(),
header: name_string
}
}
#[test]
fn test_noodle(){
use crate::test_files::TEST_FASTQ_R1;
let mut f = FastIterator::new(TEST_FASTQ_R1) ;
for fq_entry in f.reader.records().take(5).flatten() {
let fff = noodles_record_to_fastq_entry(&fq_entry);
println!("{}", fff.to_string());
}
}
#[test]
fn test_noodle2(){
use crate::test_files::TEST_FASTQ_R1;
let f = FastIterator::new(TEST_FASTQ_R1);
for r in f.take(10) {
println!("{}", r.to_string());
}
}
#[test]
fn test_noodle3(){
use crate::test_files::TEST_FASTQ_R1;
let mut f = FastIterator::new("/tmp/foo.fastq.gz");
let x = f.next();
println!("GGGGGGGGGGGGGG {:?}", x);
let s = f.next();
println!("sssssssssssss {:?}", s);
for r in f.take(10) {
println!("F: {}", r.to_string());
}
println!("F:");
}
impl Iterator for FastIterator {
type Item = FastqEntry;
fn next(&mut self) -> Option<Self::Item> {
let nread = self.reader.read_record(&mut self.buffer).unwrap();
if nread == 0 {
None
} else {
Some(noodles_record_to_fastq_entry(&self.buffer))
}
}
}
use once_cell::sync::Lazy;
pub static PHRED_LOOKUP: Lazy<PhredCache> = Lazy::new(|| {
let lookup = PhredCache::new();
lookup
});
pub struct PhredCache {
cache: Vec<f32>,
}
impl PhredCache {
pub fn new() -> Self {
let mut cache: Vec<f32> = Vec::new();
for i in 33..76 {
let c: char = i.into();
let p = phred_symbol_to_prob(c);
cache.push(p);
}
PhredCache { cache }
}
pub fn get_prob(&self, c: char) -> f32 {
let i = (c as u32) - 33;
*self.cache.get(i as usize).expect(&format!("{c} unknown"))
}
}
fn phred_symbol_to_prob(phred: char) -> f32 {
let q = (phred as u32) - 33;
10_f32.powf(-(q as f32) / 10_f32)
}
fn get_bgzf_writer(outname: &str) -> noodles_bgzf::Writer<BufWriter<File>> {
let inner = BufWriter::new(File::create(outname).unwrap());
noodles_bgzf::Writer::new(inner)
}
pub fn quality_filter(fastqname: &str, outname: &str, threshold_qc: f32) {
let fastq_iter = FastIterator::new(fastqname);
let cache = PhredCache::new();
let mut writer = get_bgzf_writer(outname);
let mut total_reads = 0;
let mut passing_reads = 0;
for fq in fastq_iter {
total_reads += 1;
let probs: f32 = fq.phred.chars().map(|c| cache.get_prob(c)).sum();
let avg_qual = probs / (fq.phred.len() as f32);
if avg_qual < threshold_qc {
write!(writer, "{}", fq.to_string()).unwrap();
passing_reads += 1;
}
}
println!(
"{}/{}({}) reads passed QC",
passing_reads,
total_reads,
(passing_reads as f32) / (total_reads as f32)
)
}
pub fn read_filter_whitelist(fastqname: &str, outname: &str, whitelist: &str) {
let whitelist_reader = BufReader::new(File::open(whitelist).unwrap());
let whitelist_header: HashSet<String> = whitelist_reader.lines().map(|f| f.unwrap()).collect();
let fastq_iter = FastIterator::new(fastqname);
let mut writer = get_bgzf_writer(outname);
let mut total_reads = 0;
let mut passing_reads = 0;
for fq in fastq_iter {
total_reads += 1;
if whitelist_header.contains(&fq.header) {
write!(writer, "{}", fq.to_string()).unwrap();
passing_reads += 1;
}
}
println!(
"{}/{}({}) reads were whitelisted",
passing_reads,
total_reads,
(passing_reads as f32) / (total_reads as f32)
)
}
pub fn fastq_list_iter(fastq_list: &[String]) -> impl Iterator<Item = FastqEntry> + '_ {
let my_iter = fastq_list
.iter()
.flat_map(move |fname| FastIterator::new(fname));
my_iter
}
pub fn fastq_phred_iter(fastq_list: &[String]) -> impl Iterator<Item = String> + '_ {
fastq_list_iter(fastq_list).map(|fq| fq.phred)
}
pub fn parse_whitelist_gz(fname: &String) -> HashSet<String> {
let reader = noodles_bgzf::Reader::new(File::open(fname).unwrap());
let my_iter = reader.lines(); let mut hset: HashSet<String> = HashSet::new();
for line in my_iter.flatten() {
hset.insert(line);
}
hset
}
#[cfg(test)]
mod testing {
use crate::io::reverse_complement;
use super::{fastq_list_iter, quality_filter, FastqEntry, PhredCache};
use rust_htslib::bgzf;
use rust_htslib::bgzf::CompressionLevel;
use std::io::BufWriter;
use std::io::Write;
fn test_make_fastq() {
let n = 1_000_000_usize;
let out = "/tmp/test.fastq.gz";
let encoder = bgzf::Writer::from_path_with_level(out, CompressionLevel::Fastest).unwrap();
let mut writer = BufWriter::new(encoder);
let dummyseq = "A".repeat(150);
let dummphred = "F".repeat(150);
for i in 0..n {
let fq = FastqEntry {
header: format!("@Read{i}"),
seq: dummyseq.clone(),
phred: dummphred.clone(),
};
write!(writer, "{}", fq.to_string()).unwrap();
}
}
#[test]
fn test_phred_cache() {
let cache = PhredCache::new();
assert_eq!(0.0001, cache.get_prob('I')); assert_eq!(0.001, cache.get_prob('?')); assert_eq!(0.01, cache.get_prob('5')); assert_eq!(0.1, cache.get_prob('+')); assert_eq!(1_f32, cache.get_prob('!'));
}
pub fn test_filter() {
let file = "/home/michi/mounts/TB4drive/ISB_data/LT_pilot/LT_pilot/raw_data/Ice1/Ice1_CKDL210025651-1a-SI_TT_D2_HVWMHDSX2_S8_L001_R2_001.fastq.gz";
let out = "/tmp/filtered.fastq.gz";
println!("Filtering!");
use std::time::Instant;
let now = Instant::now();
quality_filter(file, out, 0.01);
let elapsed_time = now.elapsed();
println!("Running took {} sec.", elapsed_time.as_secs());
}
#[test]
fn test_fastq() {
let fastq_entry1 = "@some_read_id
AAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGGCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
";
let fastq_entry2 = "@another_read_id
GGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
";
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;
let fastqname = "/tmp/foo.fastq";
let f = File::create(fastqname).expect("Unable to create file");
let mut f = BufWriter::new(f);
f.write_all(fastq_entry1.as_bytes())
.expect("Unable to write data");
f.write_all(fastq_entry2.as_bytes())
.expect("Unable to write data");
f.flush().unwrap();
let lines: Vec<_> = fastq_list_iter(&vec![fastqname.to_string()])
.map(|fq| fq.header)
.collect();
assert_eq!(lines, vec!["@some read id", "@another read id"]);
let lines: Vec<_> = fastq_list_iter(&vec![fastqname.to_string()])
.map(|fq| fq.seq)
.collect();
assert_eq!(
lines,
vec![
"AAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGGCCCC",
"GGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTT"
]
);
let lines: Vec<_> = fastq_list_iter(&vec![fastqname.to_string()])
.map(|fq| fq.phred)
.collect();
assert_eq!(
lines,
vec![
"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"
]
);
}
#[test]
fn test_rc(){
assert_eq!(
reverse_complement("AAGG"), "CCTT"
);
assert_eq!(
reverse_complement("AAAA"), "TTTT"
);
assert_eq!(
reverse_complement("ATGC"), "GCAT"
);
}
}