//! Counts kmers.
//! Each line is a kmer with two columns separated by tab: kmer, count
//! Optional columns starting with column 3 are the reads that start with that kmer
//! with a delimiter of `~`
//! # Examples
//! Counting kmers of 15. Using `--paired-end` will not matter here.
//!
//! ```bash
//! cat testdata/four_reads.fastq | fasten_kmer -k 15 > 15mers.tsv
//! ```
//!
//! Counting kmers and retaining reads
//!
//! ```bash
//! cat testdata/four_reads.fastq | \
//! fasten_kmer -k 15 --remember-reads > 15mers.tsv
//! ```
//!
//! # Example output
//!
//! First two lines of a kmer output where they contain reads
//!
//! ```text
//! TAGTGAATCCTTTTTCATAAA 39 @M03235:53:000000000-AHLTD:1:1113:23312:4764 1:N:0:6~TAGTGAATCCTTTTTCATAAAATCTTGCTTCAAAATTGCTAAGAGTTTATAAGCAAGAAGTGTTCCAAGTTTGCAAGATGAGGTGAGATTGTGTAAATAAGCTACAAAATTTTTAATTTAAGCCCTACAAGCTCTTAAATATCAAAAGCATTTTCTAAAATATGCAAAAATGTAAGCAAAATGTTTAAAGGAAAGTCGTGAAAAATGCTGAAAAAACTTTAAGAAGGAATTTTTTTACCCTAATCTTACTT~+~>AAA>DDFFFFFGGGGGGGGGGHHHHHHHHHHHHHHHHHHHHGHHGHHHHHHHHFHHHHHGGHHHHHHHGHHHHHHGHHHHHHCHGHHFHHHHGHHHHHHHHHHHGHHHGHHHHHHHHHHHHHHHGHHHGHHFHGHFHHHHHHGHHHFHHHHHHHGHHHHHHHHGHHHHHHHHGHHFHHHHHHHHHHHHHHHGHGFHAFDHGHFHFHHGHGHHFHGFFGGHHHHHFGHFHB=0D::GCGHBHHFBCGGGGG~@M03235:53:000000000-AHLTD:1:1113:23312:4764 2:N:0:6~TCGTAGTAGTATTTCCTAAAATAAGGCAAACCATAGATGATAGACCCACAAAAAGAAAGTAAGATTAGGGTAAAAAAATTCCTTCTTAAAGTTTTTTCAGCATTTTTCACGACTTTCCTTTAAACATTTTGCTTACATTTTTGCATATTTTAGAAAATGCTTTTGATATTTAAGAGCTTGTAGGGCTTAAATTAAAAATTTTGTAGCTTATTTACACAATCTCACCTCATCTTGCAAACTTGGAACACTT~+~CCCCDCCFFFFFGGGGGGGGGGHHHHHHHHHGHHHHHHHHHHGHHHGHGGGHHHGGGHHHHFFHHFHHHFEGGHHHGGHHFHHHHHHHGHHFDGHGGFHHHHHHHHHHGHHGGGGGHHGGHHGHHHHHHHHHGHHHHHHHHHGGHHHHHHHHHHHHGHFFHHHHHGHGHHHFHFHHFHHHHHFBFFHHHEDHHHGFHHHHGHGHHDHBGGHHGHHFDGEHHHFFHHFHGHHHHGFC::CBFFBBFF/CFB
//! TATCAAGGCTGCTCAAATGAT 35 @M03235:53:000000000-AHLTD:1:1114:18962:2371 1:N:0:6~TATCAAGGCTGCTCAAATGATGGCTTTTGTTATGCTCCGCAAAAGCGTGAATTTAGAATTTTTAAAGAGGGTCAAATTTATAAAACTAGCCCTTATGAAACAATGCAAAGTGAAGAAGAGCAAATCGCCTTTTCTTTGAAAAATGAAAATTTAGCACTCATCTTGCTTAGTTTTTTTGGTTACGGACTTTTGCTTTCTCTTACGCCTTGCACCTTACCGATGATTCCTATTTTATCTTCACTTATCATAG~+~AABBA5FBAFFBGGGGGGGGGGHGHHHFHHHHHHHHHCGGGGGBHFFEE2FHHFHHHFGGHHHGHHHFHGGGHGHHHHGHHHHHHHHHHHHFHHHGHHHHFFHHHHHHHHHHGHHHCGGGH3FHEGDAFGGGGHHFGHFHHEHHHGHFFFHHGEHHHHGHHHFHFFHHHHDFHHGCFDGHEHFEGDCCHHHBBG0GFHFHHBGGF-G?BGGGCGCG//;.9.CBFB0BBGGGGBFFFF0;0FFGFGBF00~@M03235:53:000000000-AHLTD:1:1114:18962:2371 2:N:0:6~GATTAAAGAAAGTAAAAAGCTTTGTTTTTTAGAAGGTTTCGTGCCACCTTTTGCTATGATAAGTGAAGATAAAATAGGAATCATCGGTAAGGTGCAAGGCGTAAGAGAAAGCAAAAGTCCGTAACCAAAAAAACTAAGCAAGATGAGTGCTAAATTTTCATTTTTCAAAGAAAAGGCGATTTGCTCTTCTTCACTTTGCATTGTTTCATAAGGGCTAGTTTTATAAATTTGACCCTCTTTAAAAATTCTAA~+~CCCCCFFFFFFFGGGGGGGGGGHHFHHHGGHHGGHHGHHHGHGGHHHGHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHGGGEGFGFHHFHGHGGGEHGHHHHHHGHHHHFHFE?GEGHHHHGGGGGGGHHHHGHHHHHHHFDFHHHHGFHFHHGHHGHHHHHHHHHH.A@EGGC0D0G0D0GDHFHHFCC00FGFHHHHHHHFHB;EFGGFGGBFGEFGGFFFFGCBFGGGGGGBFGGFGFFF
//! ```
//! # Usage
//! ```text
//! Usage: fasten_kmer [-h] [-n INT] [-p] [-v] [-k INT]
//! Options:
//! -h, --help Print this help menu.
//! -n, --numcpus INT Number of CPUs (default: 1)
//! -p, --paired-end The input reads are interleaved paired-end
//! -v, --verbose Print more status messages
//! -k, --kmer-length INT
//! The size of the kmer
//! -r, --revcomp Count kmers on the reverse complement strand too
//! -m, --remember-reads
//! Add reads to subsequent columns. Each read begins with
//! the kmer. Only lists reads in the forward direction.
//! ```
extern crate fasten;
extern crate statistical;
extern crate getopts;
extern crate rand;
extern crate fancy_regex;
use std::io::BufReader;
use std::io::BufRead;
use std::io::stdin;
use std::io::Stdin;
use rand::Rng;
//use rand::seq::SliceRandom;
use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
use fasten::logmsg;
use fancy_regex::Regex;
use std::collections::HashMap;
/// Glues together paired end reads internally and is a
/// character not expected in any read
const READ_SEPARATOR :char = '~';
#[test]
/// Let's count some kmers on homopolymers
fn test_kmer_counting_homopolymers () {
let k = 4;
// The first easy test is an AAAA kmer in a Ax10 homopolymer
let a_homopolymer = (0..10).map(|_| "A").collect::<String>();
let mut kmer = kmers_in_str(&a_homopolymer, k, false);
let obs_a_count = *kmer.entry(String::from("AAAA")).or_insert(0);
assert_eq!(obs_a_count, 7, "10-mer A yields seven 4-mers of A");
let obs_t_count = *kmer.entry(String::from("TTTT")).or_insert(0);
assert_eq!(obs_t_count, 0, "10-mer A yields zero 4-mers of T");
// Ok but what about revcom kmers
let mut kmer = kmers_in_str(&a_homopolymer, k, true );
let obs_a_count = *kmer.entry(String::from("AAAA")).or_insert(0);
assert_eq!(obs_a_count, 7, "10-mer A yields seven 4-mers of A");
let obs_t_count = *kmer.entry(String::from("TTTT")).or_insert(0);
assert_eq!(obs_t_count, 7, "10-mer A yields seven 4-mers of T");
}
#[test]
/// Let's count some kmers on something more complicated
fn test_kmer_counting_4mers () {
let k = 4;
// This is the first seq of the test file four_reads.fastq
let seq = "AAAGTGCTCTTAACTTGTCCCGCTCCACATCAGCGCGACATCAATCGACATTAAACCGAGTATCTTGTCAGCCTGGGGTGACGATGCGTCCCATTAAAGT";
let my_kmer = "TTAA";
let mut kmer = kmers_in_str(&seq, k, false);
let obs = *kmer.entry(String::from(my_kmer)).or_insert(0);
assert_eq!(obs, 3, "Found {} kmer in seq {} times. Seq is \n{}", my_kmer, obs, seq);
// But also test for revcom kmers
let mut kmer = kmers_in_str(&seq, k, true );
let obs = *kmer.entry(String::from(my_kmer)).or_insert(0);
assert_eq!(obs, 6, "Found {} kmer in seq {} times. Seq is \n{}", my_kmer, obs, seq);
// one more kmer
let my_kmer = "TGTC";
let mut kmer = kmers_in_str(&seq, k, false);
let obs = *kmer.entry(String::from(my_kmer)).or_insert(0);
assert_eq!(obs, 2, "Found {} kmer in seq {} times. Seq is \n{}", my_kmer, obs, seq);
// and revcom
let mut kmer = kmers_in_str(&seq, k, true );
let obs = *kmer.entry(String::from(my_kmer)).or_insert(0);
assert_eq!(obs, 4, "Found {} kmer in seq {} times. Seq is \n{}", my_kmer, obs, seq);
}
fn main(){
let mut opts = fasten_base_options();
// script-specific options
let default_k:usize = 21;
opts.optopt("k","kmer-length",&format!("The size of the kmer (default: {})",default_k),"INT");
opts.optflag("r","revcomp", "Count kmers on the reverse complement strand too");
opts.optflag("m","remember-reads", "Add reads to subsequent columns. Each read begins with the kmer. Only lists reads in the forward direction.");
let matches = fasten_base_options_matches("Counts kmers.", opts);
//if matches.opt_present("paired-end") {
// logmsg("WARNING: --paired-end is not utilized in this script");
//}
let kmer_length:usize={
if matches.opt_present("kmer-length") {
matches.opt_str("kmer-length")
.expect("ERROR: could not understand parameter --kmer-length")
.parse()
.expect("ERROR: --kmer-length is not an INT")
} else {
default_k
}
};
let stdin = stdin();
count_kmers(stdin, kmer_length, matches.opt_present("revcomp"), matches.opt_present("remember-reads"), matches.opt_present("paired-end"));
}
/// Read fastq from stdin and count kmers
fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:bool, paired_end:bool) {
// keep track of which sequences start with which kmers
let mut kmer_to_seqs :HashMap<String, Vec<String>> = HashMap::new();
// Some randomness
let mut rng = rand::thread_rng();
// Regular expression to find uncomplex kmers
let low_complexity = Regex::new(r"N|(.)\1{2,}|(.)(.)(\2\3){1,}").unwrap();
// read the file
let my_buffer=BufReader::new(stdin);
let mut buffer_iter = my_buffer.lines();
let mut kmer_hash :HashMap<String,u32> = HashMap::new();
while let Some(id_opt) = buffer_iter.next() {
let seq = buffer_iter.next().expect("ERROR reading a sequence line")
.expect("ERROR reading a sequence line");
// burn the plus line
let plus_opt = buffer_iter.next();
// burn the qual line
let qual_opt = buffer_iter.next();
// get all the kmers in this entry
let entry_kmers = kmers_in_str(&seq, kmer_length, revcomp);
// merge the entry kmers
for (key, value) in entry_kmers.iter() {
let kmer_count = kmer_hash.entry(String::from(key)).
or_insert(0);
*kmer_count += value;
}
let kmer_keys: Vec<&String> = entry_kmers.keys().collect();
// If this is paired end and if we're saving the second pair's
// read, then reserve a declaired variable here for the string.
let mut r2_read_string :String = String::new();
if paired_end {
let id2 = buffer_iter.next().expect("reading the ID2 line")
.expect("reading the ID2 line");
let seq2 = buffer_iter.next().expect("ERROR reading a sequence line, second in pair")
.expect("ERROR reading a sequence line, second in pair");
// burn the plus line
let plus2 = buffer_iter.next().expect("reading the plus2 line")
.expect("reading the plus2 line");
// burn the qual line
let qual2 = buffer_iter.next().expect("reading the qual2 line")
.expect("reading the qual2 line");
// get all the kmers in this entry
let entry_kmers2 = kmers_in_str(&seq2, kmer_length, revcomp);
// merge the entry kmers
for (key, value) in entry_kmers2.iter() {
let kmer_count = kmer_hash.entry(String::from(key)).
or_insert(0);
*kmer_count += value;
}
r2_read_string = vec![id2,seq2,plus2,qual2].join(&READ_SEPARATOR.to_string());
}
// Remember the read that initiated this
if remember_reads {
// Get the kmer substring
// let init_kmer = String::from(&seq[0..kmer_length]);
// Get the minimizer as the initial kmer
//let init_kmer = calculate_minimizer(&seq, kmer_length);
//let init_kmer_pos = rand::thread_rng().gen_range(0 .. seq.len() - kmer_length + 1);
//let init_kmer = seq[init_kmer_pos .. init_kmer_pos+kmer_length].to_string();
let mut random_index = rng.gen_range(0..kmer_keys.len());
let mut init_kmer = kmer_keys[random_index].to_string();
let mut kmer_tries :u16 = 0;
while kmer_tries < 1000 && low_complexity.is_match(&init_kmer).unwrap() {
random_index = rng.gen_range(0..kmer_keys.len());
init_kmer = kmer_keys[random_index].to_string();
kmer_tries += 1;
//fasten::logmsg(format!("skipping kmer {}, try {}",&init_kmer, kmer_tries));
}
// Get the vector of sequences for this kmer, or else initialize an empty vector
let init_kmer_vec = kmer_to_seqs.entry(init_kmer).or_insert(vec![]);
// get the formatted entry
let id = id_opt.expect("reading the ID line");
let plus = plus_opt.expect("reading the plus line")
.expect("reading the plus line");
let qual = qual_opt.expect("reading the qual line")
.expect("reading the qual line");
if paired_end {
init_kmer_vec.push(
vec![id, seq, plus, qual, r2_read_string].join(&READ_SEPARATOR.to_string())
);
}
else {
init_kmer_vec.push(
vec![id, seq, plus, qual].join(&READ_SEPARATOR.to_string())
);
}
}
}
// TODO in the future: somehow efficiently remove reverse
// complement kmers before printing because it is basically
// double the information needed.
for (kmer,count) in kmer_hash.iter() {
let mut line :String = format!("{}\t{}", kmer, count);
if remember_reads {
let reads_vec = kmer_to_seqs.entry(kmer.to_string()).or_insert(vec![]);
for read in reads_vec {
line.push_str("\t");
line.push_str(read);
}
}
println!("{}", line);
}
}
/// Read a str of nucleotides and count kmers.
/// If `should_revcomp` is true, then will also count kmers on the opposite strand.
fn kmers_in_str (seq:&str, kmer_length:usize, should_revcomp:bool) -> HashMap<String,u32> {
// save the kmers in this local hash
let mut kmer_hash :HashMap<String,u32> = HashMap::new();
// how many kmers we expect in this sliding window
let seq_len = seq.len();
// Don't count short sequences
if seq_len < kmer_length {
logmsg("WARNING: found a sequence less than k");
return kmer_hash;
}
let my_num_kmers = seq_len - kmer_length + 1;
for idx in 0..my_num_kmers {
// increment the kmer count by reference
let kmer_count = kmer_hash.entry(String::from(&seq[idx..kmer_length+idx])).
or_insert(0);
*kmer_count += 1;
}
// kmer count on the revcomp sequence too
if should_revcomp {
let revcomp = revcomp(&seq);
for idx in 0..my_num_kmers {
let kmer_count = kmer_hash.entry(String::from(&revcomp[idx..kmer_length+idx]))
.or_insert(0);
*kmer_count += 1;
}
}
return kmer_hash;
}
/*
fn calculate_minimizer(sequence: &str, k: usize) -> String {
// Ensure the sequence length is greater than or equal to k
if sequence.len() < k {
panic!("Sequence length is less than k");
}
// Initialize variables to keep track of the minimum k-mer and its hash
let mut min_kmer = &sequence[0..k];
let mut min_hash = hash_kmer(min_kmer);
// Iterate over the sequence to find the minimum k-mer and its hash
for i in 1..=(sequence.len() - k) {
let current_kmer = &sequence[i..(i + k)];
let current_hash = hash_kmer(current_kmer);
// Update the minimum k-mer and its hash if a smaller hash is found
if current_hash < min_hash {
min_kmer = current_kmer;
min_hash = current_hash;
}
}
// Return the minimizer k-mer
min_kmer.to_string()
}
fn hash_kmer(kmer: &str) -> u64 {
// Simple hash function that converts each nucleotide to its ASCII value and sums them up
kmer.bytes().map(|b| b as u64).sum()
}
*/
/// reverse-complement a dna sequence
// Thanks Henk for supplying these functions.
fn revcomp(dna: &str) -> String {
let mut rc_dna: String = String::with_capacity(dna.len());
for c in dna.chars().rev() {
rc_dna.push(switch_base(c))
}
rc_dna
}
/// Complementary nucleotide for ACTGUN, case insensitive
fn switch_base(c: char) -> char {
match c {
'a' => 't',
'c' => 'g',
't' => 'a',
'g' => 'c',
'u' => 'a',
'n' => 'n',
'A' => 'T',
'C' => 'G',
'T' => 'A',
'G' => 'C',
'U' => 'A',
'N' => 'N',
_ => 'N',
}
}