use fnv;
use crate::seq;
use std;
use std::cmp;
use needletail::parse_fastx_file;
use packed_seq::PackedSeqVec;
use packed_seq::SeqVec;
use simd_minimizers::canonical_minimizers;
use seq_hash::NtHasher;
pub fn read_fasta(filename: String) -> Vec<String> {
let mut reader = parse_fastx_file(&filename)
.unwrap_or_else(|_| panic!("Error reading FASTA/FASTQ file {}", filename));
let mut vec: Vec<String> = Vec::new();
while let Some(record) = reader.next() {
let record = record.expect("Invalid FASTA/FASTQ record");
let seq = record.seq();
if seq.len() > 1 {
vec.push(String::from_utf8(seq.to_vec()).expect("Non UTF-8 sequence"));
}
}
vec
}
pub fn read_fasta_mf(filename: String) -> (Vec<String>, Vec<String>) {
let mut reader = parse_fastx_file(&filename)
.unwrap_or_else(|_| panic!("Error reading FASTA/FASTQ file {}", filename));
let mut labels: Vec<String> = Vec::new();
let mut seqs: Vec<String> = Vec::new();
while let Some(record) = reader.next() {
let record = record.expect("Invalid FASTA/FASTQ record");
labels.push(String::from_utf8(record.id().to_vec()).expect("Non UTF-8 id"));
let seq = record.seq();
if seq.len() > 1 {
seqs.push(String::from_utf8(seq.to_vec()).expect("Non UTF-8 sequence"));
}
}
(labels, seqs)
}
#[inline]
pub fn kmerize_vector(
v: &Vec<String>,
k: usize,
d: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
for l in v {
if l.len() < k {
continue;
} else {
let length_l = l.len();
if length_l < k {
continue;
} else {
let l_r = revcomp(&l);
for i in (0..l.len() - k + 1).step_by(d) {
if seq::has_no_n(l[i..i + k].as_bytes()) {
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
let count = map
.entry(l[i..i + k].to_string().to_uppercase())
.or_insert(0);
*count += 1;
} else {
let count = map
.entry(
l_r[length_l - (i + k)..length_l - i]
.to_string()
.to_uppercase(),
)
.or_insert(0);
*count += 1;
}
}
}
}
}
}
map
}
#[inline]
pub fn kmerize_vector_set(
v: &Vec<String>,
k: usize,
d: usize,
) -> fnv::FnvHashSet<std::string::String> {
let mut set = fnv::FnvHashSet::default();
for l in v {
if l.len() < k {
continue;
} else {
let length_l = l.len();
if length_l < k {
continue;
} else {
let l_r = revcomp(&l);
for i in (0..l.len() - k + 1).step_by(d) {
if seq::has_no_n(l[i..i + k].as_bytes()) {
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
set.insert(l[i..i + k].to_string().to_uppercase());
} else {
set.insert(
l_r[length_l - (i + k)..length_l - i]
.to_string()
.to_uppercase(),
);
}
}
}
}
}
}
set
}
#[inline]
pub fn kmerize_vector_uppercase(
v: Vec<String>,
k: usize,
d: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
for l in v {
let length_l = l.len();
let l_r = revcomp(&l);
for i in (0..l.len() - k + 1).step_by(d) {
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
let count = map.entry(l[i..i + k].to_string()).or_insert(0);
*count += 1;
} else {
let count = map
.entry(l_r[length_l - (i + k)..length_l - i].to_string())
.or_insert(0);
*count += 1;
}
}
}
map
}
#[inline]
pub fn kmerize_vector_skip_n(
v: &Vec<String>,
k: usize,
d: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
for l in v {
let length_l = l.len();
let l_r = revcomp(&l);
for i in (0..l.len() - k + 1).step_by(d) {
if seq::has_no_n(l[i..i + k].as_bytes()) {
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
let count = map.entry(l[i..i + k].to_string()).or_insert(0);
*count += 1;
} else {
let count = map
.entry(l_r[length_l - (i + k)..length_l - i].to_string())
.or_insert(0);
*count += 1;
}
} else {
continue;
}
}
}
map
}
#[inline]
pub fn kmerize_vector_skip_n_set(
v: &Vec<String>,
k: usize,
d: usize,
) -> fnv::FnvHashSet<std::string::String> {
let mut map = fnv::FnvHashSet::default();
for l in v {
let length_l = l.len();
let l_r = revcomp(&l);
for i in (0..l.len() - k + 1).step_by(d) {
if seq::has_no_n(l[i..i + k].as_bytes()) {
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
map.insert(l[i..i + k].to_string());
} else {
map.insert(l_r[length_l - (i + k)..length_l - i].to_string());
}
} else {
continue;
}
}
}
map
}
#[inline]
pub fn kmerize_vector_skip_n_set_str(
v: &Vec<&str>,
k: usize,
d: usize,
) -> fnv::FnvHashSet<std::string::String> {
let mut map = fnv::FnvHashSet::default();
for l in v {
let length_l = l.len();
let l_r = revcomp(&l);
for i in (0..l.len() - k + 1).step_by(d) {
if seq::has_no_n(l[i..i + k].as_bytes()) {
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
map.insert(l[i..i + k].to_string());
} else {
map.insert(l_r[length_l - (i + k)..length_l - i].to_string());
}
} else {
continue;
}
}
}
map
}
#[inline]
pub fn kmerize_string(l: String, k: usize) -> Option<fnv::FnvHashMap<std::string::String, usize>> {
let mut map = fnv::FnvHashMap::default();
let length_l = l.len();
let l_r = revcomp(&l);
if length_l < k {
None
} else {
Some({
for i in 0..l.len() - k + 1 {
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
let count = map
.entry(l[i..i + k].to_string().to_uppercase())
.or_insert(0);
*count += 1;
} else {
let count = map
.entry(
l_r[length_l - (i + k)..length_l - i]
.to_string()
.to_uppercase(),
)
.or_insert(0);
*count += 1;
}
}
map
})
}
}
pub fn minimerize_vector(
v: Vec<String>,
k: usize,
m: usize,
d: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
for l in v {
let len = l.len();
if len < k || len < m {
continue;
}
let bytes = l.as_bytes();
let packed = PackedSeqVec::from_ascii(bytes);
let n_mers = bytes.len().saturating_sub(m).saturating_add(1);
if n_mers == 0 {
continue;
}
let w = if k > m { k - m + 1 } else { 1 };
let hasher: NtHasher<true, 7> = NtHasher::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
for (idx, &pos_u32) in minimizer_positions.iter().enumerate() {
if d > 1 && (idx % d != 0) {
continue;
}
let pos = pos_u32 as usize;
if pos + m > len {
continue;
}
let minmer = &l[pos..pos + m];
*map.entry(minmer.to_string()).or_insert(0) += 1;
}
}
map
}
pub fn minimerize_vector_skip_n(
v: &Vec<String>,
k: usize,
m: usize,
d: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
for l in v {
let len = l.len();
if len < k || len < m {
continue;
}
let bytes = l.as_bytes();
let packed = PackedSeqVec::from_ascii(bytes);
let n_mers = bytes.len().saturating_sub(m).saturating_add(1);
if n_mers == 0 {
continue;
}
let w = if k > m { k - m + 1 } else { 1 };
let hasher: NtHasher<true, 7> = NtHasher::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
for (idx, &pos_u32) in minimizer_positions.iter().enumerate() {
if d > 1 && (idx % d != 0) {
continue;
}
let pos = pos_u32 as usize;
if pos + m > len {
continue;
}
let minmer = &l[pos..pos + m];
if minmer
.as_bytes()
.iter()
.any(|&b| b == b'N' || b == b'n')
{
continue;
}
*map.entry(minmer.to_ascii_uppercase()).or_insert(0) += 1;
}
}
map
}
pub fn minimerize_vector_skip_n_set(
v: &Vec<String>,
k: usize,
m: usize,
d: usize,
) -> fnv::FnvHashSet<std::string::String> {
let mut set = fnv::FnvHashSet::default();
for l in v {
let len = l.len();
if len < k || len < m {
continue;
}
let bytes = l.as_bytes();
let packed = PackedSeqVec::from_ascii(bytes);
let n_mers = bytes.len().saturating_sub(m).saturating_add(1);
if n_mers == 0 {
continue;
}
let w = if k > m { k - m + 1 } else { 1 };
let hasher: NtHasher<true, 7> = NtHasher::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
for (idx, &pos_u32) in minimizer_positions.iter().enumerate() {
if d > 1 && (idx % d != 0) {
continue;
}
let pos = pos_u32 as usize;
if pos + m > len {
continue;
}
let minmer = &l[pos..pos + m];
if minmer
.as_bytes()
.iter()
.any(|&b| b == b'N' || b == b'n')
{
continue;
}
set.insert(minmer.to_ascii_uppercase());
}
}
set
}
pub fn minimerize_vector_skip_n_set_str(
v: &Vec<&str>,
k: usize,
m: usize,
d: usize,
) -> fnv::FnvHashSet<std::string::String> {
let mut set = fnv::FnvHashSet::default();
for l in v {
let l: &str = *l;
let len = l.len();
if len < k || len < m {
continue;
}
let bytes = l.as_bytes();
let packed = PackedSeqVec::from_ascii(bytes);
let n_mers = bytes.len().saturating_sub(m).saturating_add(1);
if n_mers == 0 {
continue;
}
let w = if k > m { k - m + 1 } else { 1 };
let hasher: NtHasher<true, 7> = NtHasher::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
for (idx, &pos_u32) in minimizer_positions.iter().enumerate() {
if d > 1 && (idx % d != 0) {
continue;
}
let pos = pos_u32 as usize;
if pos + m > len {
continue;
}
let minmer = &l[pos..pos + m];
if minmer
.as_bytes()
.iter()
.any(|&b| b == b'N' || b == b'n')
{
continue;
}
set.insert(minmer.to_ascii_uppercase());
}
}
set
}
pub fn kmers_from_fq(filename: String, k: usize) -> fnv::FnvHashMap<String, usize> {
let mut map = fnv::FnvHashMap::default();
let mut reader = parse_fastx_file(&filename)
.unwrap_or_else(|_| panic!("Error reading FASTQ/FASTA file {}", filename));
while let Some(record) = reader.next() {
let record = record.expect("Invalid FASTQ/FASTA record");
let seq = record.seq();
if seq.len() < k {
continue;
}
let l = String::from_utf8(seq.to_vec()).expect("Non UTF-8 sequence");
let length_l = l.len();
if length_l < k {
continue;
}
let k1 = k / 2;
for i in 0..=(length_l - k) {
let end = i + k;
let substring = &l[i..end];
if substring.contains('N') {
continue;
}
let substring_rev = revcomp(substring);
let canonical = match substring[..k1].cmp(&substring_rev[..k1]) {
cmp::Ordering::Less => substring.to_string(),
cmp::Ordering::Greater => substring_rev,
cmp::Ordering::Equal => {
if substring[k1..] <= substring_rev[k1..] {
substring.to_string()
} else {
substring_rev
}
}
};
let counter = map.entry(canonical).or_insert(0);
*counter += 1;
}
}
map
}
pub fn kmers_from_fq_qual(
filename: String,
k: usize,
_d: usize,
qual_offset: u8,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
let mut reader = parse_fastx_file(&filename)
.unwrap_or_else(|_| panic!("Error reading FASTQ file {}", filename));
let d = 1usize;
while let Some(record_res) = reader.next() {
let record = record_res.expect("Invalid FASTQ record");
let qual = match record.qual() {
Some(q) => q,
None => continue,
};
let seq_bytes = record.seq();
if seq_bytes.is_empty() {
continue;
}
let seq_str = String::from_utf8(seq_bytes.to_vec()).expect("Non UTF-8 sequence");
let qual_str = String::from_utf8(qual.to_vec()).expect("Non UTF-8 quality string");
let masked = seq::qual_mask(&seq_str, &qual_str, qual_offset);
let l = masked;
let length_l = l.len();
if length_l < k {
continue;
}
let l_r = revcomp(&l);
for i in 0..=length_l - k {
if i % d != 0 {
continue;
}
if !seq::has_no_n(l[i..i + k].as_bytes()) {
continue;
}
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
let count = map.entry(l[i..i + k].to_string()).or_insert(0);
*count += 1;
} else {
let count = map
.entry(l_r[length_l - (i + k)..length_l - i].to_string())
.or_insert(0);
*count += 1;
}
}
}
map
}
pub fn kmers_from_fq_minimizer(
filename: String,
k: usize,
m: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
let mut reader = parse_fastx_file(&filename)
.unwrap_or_else(|_| panic!("Error reading FASTQ/FASTA file {}", filename));
while let Some(record_res) = reader.next() {
let record = record_res.expect("Invalid FASTA/FASTQ record");
let seq_bytes = record.seq();
if seq_bytes.len() < m || seq_bytes.len() < k {
continue;
}
let seq_str = String::from_utf8(seq_bytes.to_vec()).expect("Non UTF-8 sequence");
let l = seq_str;
if l.len() < m || l.len() < k {
continue;
}
let bytes = l.as_bytes();
let packed = PackedSeqVec::from_ascii(bytes);
let n_mers = bytes.len() - m + 1;
if n_mers == 0 {
continue;
}
let w = if k > m { k - m + 1 } else { 1 };
let hasher: NtHasher<true, 7> = NtHasher::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
for &pos_u32 in &minimizer_positions {
let pos = pos_u32 as usize;
if pos + m > l.len() {
continue;
}
let minmer = &l[pos..pos + m];
if minmer
.as_bytes()
.iter()
.any(|&b| b == b'N' || b == b'n')
{
continue;
}
*map.entry(minmer.to_string()).or_insert(0) += 1;
}
}
map
}
pub fn kmers_fq_pe(
filenames: Vec<&str>,
k: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
for filename in filenames {
let mut reader = parse_fastx_file(filename)
.unwrap_or_else(|_| panic!("Error reading FASTQ/FASTA file {}", filename));
while let Some(record_res) = reader.next() {
let record = record_res.expect("Invalid FASTA/FASTQ record");
let seq_bytes = record.seq();
if seq_bytes.len() < k {
continue;
}
let l = String::from_utf8(seq_bytes.to_vec()).expect("Non UTF-8 sequence");
let length_l = l.len();
if length_l < k {
continue;
}
let l_r = revcomp(&l);
for i in 0..=length_l - k {
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
let count = map.entry(l[i..i + k].to_string()).or_insert(0);
*count += 1;
} else {
let count = map
.entry(l_r[length_l - (i + k)..length_l - i].to_string())
.or_insert(0);
*count += 1;
}
}
}
}
map
}
pub fn kmers_fq_pe_qual(
filenames: Vec<&str>,
k: usize,
d: usize,
qual_offset: u8,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
let mut reader1 = parse_fastx_file(filenames[0])
.unwrap_or_else(|_| panic!("Error reading FASTQ file {}", filenames[0]));
let mut reader2 = parse_fastx_file(filenames[1])
.unwrap_or_else(|_| panic!("Error reading FASTQ file {}", filenames[1]));
let mut process_read = |seq_bytes: &[u8], qual_bytes: &[u8]| {
let seq_str =
String::from_utf8(seq_bytes.to_vec()).expect("Non UTF-8 sequence in PE read");
let qual_str =
String::from_utf8(qual_bytes.to_vec()).expect("Non UTF-8 quality string in PE read");
let masked = seq::qual_mask(&seq_str, &qual_str, qual_offset);
let l = masked;
let length_l = l.len();
if length_l < k {
return;
}
let l_r = revcomp(&l);
for i in 0..=length_l - k {
if i % d != 0 {
continue;
}
if !seq::has_no_n(l[i..i + k].as_bytes()) {
continue;
}
if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] {
let count = map.entry(l[i..i + k].to_string()).or_insert(0);
*count += 1;
} else {
let count = map
.entry(l_r[length_l - (i + k)..length_l - i].to_string())
.or_insert(0);
*count += 1;
}
}
};
loop {
let rec1_opt = reader1.next();
let rec2_opt = reader2.next();
let (rec1, rec2) = match (rec1_opt, rec2_opt) {
(Some(Ok(r1)), Some(Ok(r2))) => (r1, r2),
_ => break, };
let (q1, q2) = match (rec1.qual(), rec2.qual()) {
(Some(q1), Some(q2)) => (q1, q2),
_ => continue, };
process_read(&rec1.seq(), q1);
process_read(&rec2.seq(), q2);
}
map
}
pub fn kmers_fq_pe_minimizer(
filenames: Vec<&str>,
k: usize,
m: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
for filename in filenames {
let mut reader = parse_fastx_file(filename)
.unwrap_or_else(|_| panic!("Error reading FASTQ/FASTA/PE file {}", filename));
while let Some(record_res) = reader.next() {
let record = record_res.expect("Invalid FASTA/FASTQ record");
let seq_bytes = record.seq();
if seq_bytes.len() < m || seq_bytes.len() < k {
continue;
}
let seq_str = String::from_utf8(seq_bytes.to_vec()).expect("Non UTF-8 sequence");
let l = seq_str;
if l.len() < m || l.len() < k {
continue;
}
let bytes = l.as_bytes();
let packed = PackedSeqVec::from_ascii(bytes);
let n_mers = bytes.len() - m + 1;
if n_mers == 0 {
continue;
}
let w = if k > m { k - m + 1 } else { 1 };
let hasher: NtHasher<true, 7> = NtHasher::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
for &pos_u32 in &minimizer_positions {
let pos = pos_u32 as usize;
if pos + m > l.len() {
continue;
}
let minmer = &l[pos..pos + m];
if minmer
.as_bytes()
.iter()
.any(|&b| b == b'N' || b == b'n')
{
continue;
}
*map.entry(minmer.to_string()).or_insert(0) += 1;
}
}
}
map
}
pub fn kmers_fq_pe_minimizer_qual(
filenames: Vec<&str>,
k: usize,
m: usize,
d: usize,
qual_offset: u8,
) -> fnv::FnvHashMap<std::string::String, usize> {
use needletail::parse_fastx_file;
let mut map = fnv::FnvHashMap::default();
let mut reader1 = parse_fastx_file(filenames[0])
.unwrap_or_else(|_| panic!("Error reading FASTQ file {}", filenames[0]));
let mut reader2 = parse_fastx_file(filenames[1])
.unwrap_or_else(|_| panic!("Error reading FASTQ file {}", filenames[1]));
let mut process_read = |seq_bytes: &[u8], qual_bytes: &[u8]| {
if seq_bytes.len() < m || seq_bytes.len() < k {
return;
}
let seq_str =
String::from_utf8(seq_bytes.to_vec()).expect("Non UTF-8 sequence (PE)");
let qual_str =
String::from_utf8(qual_bytes.to_vec()).expect("Non UTF-8 quality string (PE)");
let masked = crate::seq::qual_mask(&seq_str, &qual_str, qual_offset);
if masked.len() < m || masked.len() < k {
return;
}
let l = masked;
let bytes = l.as_bytes();
let packed = PackedSeqVec::from_ascii(bytes);
let n_mers = bytes.len() - m + 1;
if n_mers == 0 {
return;
}
let w = if k > m { k - m + 1 } else { 1 };
let hasher: NtHasher<true, 7> = NtHasher::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
for (_idx, &pos_u32) in minimizer_positions.iter().enumerate() {
let pos = pos_u32 as usize;
if pos + m > l.len() {
continue;
}
if d > 1 && (pos % d != 0) {
continue;
}
let minmer = &l[pos..pos + m];
if minmer
.as_bytes()
.iter()
.any(|&b| b == b'N' || b == b'n')
{
continue;
}
*map.entry(minmer.to_string()).or_insert(0) += 1;
}
};
loop {
let rec1_opt = reader1.next();
let rec2_opt = reader2.next();
let (rec1, rec2) = match (rec1_opt, rec2_opt) {
(Some(Ok(r1)), Some(Ok(r2))) => (r1, r2),
_ => break, };
let (q1, q2) = match (rec1.qual(), rec2.qual()) {
(Some(q1), Some(q2)) => (q1, q2),
_ => continue,
};
let s1 = rec1.seq();
let s2 = rec2.seq();
process_read(s1.as_ref(), q1.as_ref());
process_read(s2.as_ref(), q2.as_ref());
}
map
}
pub fn kmers_from_fq_minimizer_qual(
filename: String,
k: usize,
m: usize,
_d: usize,
qual_offset: u8,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map = fnv::FnvHashMap::default();
let mut reader = parse_fastx_file(&filename)
.unwrap_or_else(|_| panic!("Error reading FASTQ/FASTA file {}", filename));
while let Some(record_res) = reader.next() {
let record = record_res.expect("Invalid FASTA/FASTQ record");
let qual = match record.qual() {
Some(q) => q,
None => continue,
};
let seq_bytes = record.seq();
if seq_bytes.is_empty() {
continue;
}
let seq_str = String::from_utf8(seq_bytes.to_vec()).expect("Non UTF-8 sequence");
let qual_str = String::from_utf8(qual.to_vec()).expect("Non UTF-8 quality string");
let masked = seq::qual_mask(&seq_str, &qual_str, qual_offset);
let l = masked;
if l.len() < k || l.len() < m {
continue;
}
let bytes = l.as_bytes();
let packed_seq = PackedSeqVec::from_ascii(bytes);
let n_mers = bytes.len() - m + 1;
if n_mers == 0 {
continue;
}
let w = if k > m { k - m + 1 } else { 1 };
let hasher: NtHasher<true, 7> = NtHasher::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed_seq.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
for &pos_u32 in &minimizer_positions {
let pos = pos_u32 as usize;
if pos + m > l.len() {
continue;
}
let minmer = &l[pos..pos + m];
if minmer
.as_bytes()
.iter()
.any(|&b| b == b'N' || b == b'n')
{
continue;
}
let entry = map.entry(minmer.to_string()).or_insert(0);
*entry += 1;
}
}
map
}
pub fn clean_map(
map: fnv::FnvHashMap<std::string::String, usize>,
t: usize,
) -> fnv::FnvHashMap<std::string::String, usize> {
let mut map_clean = fnv::FnvHashMap::default();
for (key, value) in map {
if value > t {
map_clean.insert(key, value);
}
}
map_clean
}
pub fn revcomp(dna: &str) -> String {
let mut rc_dna = String::with_capacity(dna.len());
for c in dna.chars().rev() {
rc_dna.push(switch_base(&c))
}
rc_dna
}
fn switch_base(c: &char) -> char {
match c {
'a' => 't',
'c' => 'g',
't' => 'a',
'g' => 'c',
'u' => 'a',
'n' => 'n',
'A' => 'T',
'C' => 'G',
'T' => 'A',
'G' => 'C',
'U' => 'A',
'N' => 'N',
_ => 'N',
}
}
pub fn auto_cutoff(map: fnv::FnvHashMap<std::string::String, usize>) -> usize {
let mut histo_map = fnv::FnvHashMap::default();
let mut max_cov = 0;
for (_key, value) in &map {
if value > &max_cov {
max_cov = *value;
}
*histo_map.entry(value).or_insert(0) += 1;
}
let mut sum = 0;
for (i, p) in &histo_map {
sum += *i * p;
}
let num_kmers_total = map.len();
let total_mean: f64 = sum as f64 / num_kmers_total as f64;
if total_mean < 1.5 {
0
} else {
let mut count_vec = Vec::with_capacity(max_cov);
for c in 1..max_cov {
if !histo_map.contains_key(&c) {
count_vec.push((c, 0));
} else {
count_vec.push((c, *histo_map.get(&c).unwrap()));
}
}
let mut coverages: Vec<usize> = Vec::with_capacity(count_vec.len());
for v in count_vec {
coverages.push(v.1);
}
let mut d1 = Vec::new();
for i in 1..coverages.len() - 1 {
d1.push(coverages[i] as f64 / coverages[i + 1] as f64);
}
let mut d2 = Vec::new();
for i in 0..d1.len() - 1 {
d2.push(d1[i] / d1[i + 1]);
}
let mut first_pos_d1 = 0;
let mut first_pos_d2 = 0;
let threshold: f64 = 1.0;
for (i, p) in d1.iter().enumerate() {
if p < &threshold {
first_pos_d1 = i + 1;
break;
}
}
for (i, p) in d2.iter().enumerate() {
if p < &threshold {
first_pos_d2 = i + 1;
break;
}
}
let mut bigsum = 0;
for (i, p) in coverages[1..].iter().enumerate() {
bigsum += i * p;
}
let num_kmers: usize = coverages[1..].iter().sum();
let mean: f64 = bigsum as f64 / num_kmers as f64;
if (first_pos_d1 > 0) && ((first_pos_d1 as f64) < (mean * 0.75)) {
first_pos_d1
} else if first_pos_d2 > 0 {
first_pos_d2
} else {
cmp::max(1, (mean / 2.0).ceil() as usize)
}
}
}
pub fn find_minimizer(seq: &str, m: usize) -> String {
if m == 0 || seq.len() < m {
return String::new();
}
let bytes = seq.as_bytes();
let packed_seq = PackedSeqVec::from_ascii(bytes);
let n_kmers = bytes.len() - m + 1;
let w = n_kmers;
let hasher = <seq_hash::NtHasher>::new(m);
let mut minimizer_positions: Vec<u32> = Vec::new();
let _vals: Vec<u64> = canonical_minimizers(m, w)
.hasher(&hasher)
.run(packed_seq.as_slice(), &mut minimizer_positions)
.values_u64()
.collect();
if minimizer_positions.is_empty() {
return String::new();
}
let pos = minimizer_positions[0] as usize;
seq[pos..pos + m].to_string()
}