1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#[cfg(feature = "mash_format")]
extern crate capnp;
#[macro_use] extern crate failure;
extern crate needletail;
extern crate murmurhash3;
extern crate serde;
#[macro_use] extern crate serde_derive;
extern crate serde_json;
use std::io::{Read, Seek};
use std::path::Path;
use std::result::Result as StdResult;
use failure::Error;
use needletail::fastx::{fastx_cli, fastx_stream};
use filtering::{FilterParams, filter_sketch};
use minhashes::MinHashKmers;
use serialization::{Sketch, MultiSketch};
pub mod minhashes;
pub mod filtering;
pub mod distance;
pub mod serialization;
pub mod statistics;
#[cfg(feature = "mash_format")]
mod mash_capnp;
pub type Result<T> = StdResult<T, Error>;
pub fn mash_files(filenames: Vec<&str>, n_hashes: usize, final_size: usize, kmer_length: u8, filters: &mut FilterParams, no_strict: bool, seed: u64) -> Result<MultiSketch> {
let mut sketches = Vec::with_capacity(filenames.len());
for filename in &filenames {
let mut seq_len = 0u64;
let mut n_kmers = 0u64;
let path = Path::new(filename);
let mut minhash = match filters.filter_on {
Some(true) | None => MinHashKmers::new(n_hashes, seed),
Some(false) => MinHashKmers::new(final_size, seed),
};
fastx_cli(path.to_str().ok_or(format_err!("Couldn't make path into string"))?, |seq_type| {
if let None = filters.filter_on {
filters.filter_on = match seq_type {
"FASTA" => Some(false),
"FASTQ" => Some(true),
_ => panic!("Unknown sequence type"),
};
}
}, |seq| {
seq_len += seq.seq.len() as u64;
for (_, kmer, is_rev_complement) in seq.normalize(false).kmers(kmer_length, true) {
let rc_count = match is_rev_complement {
true => 1u8,
false => 0u8,
};
n_kmers += 1;
minhash.push(kmer, rc_count);
}
}).map_err(|e| format_err!("{}", e.to_string()))?;
let hashes = minhash.into_vec();
let (mut filtered_hashes, filter_stats) = filter_sketch(&hashes, &filters);
filtered_hashes.truncate(final_size);
if !no_strict && filtered_hashes.len() < final_size {
bail!("{} had too few kmers ({}) to sketch", filename, filtered_hashes.len());
}
let basename = path.file_name().ok_or(format_err!("Couldn't get filename from path"))?;
let sketch = Sketch::new(basename.to_str().ok_or(format_err!("Couldn't make filename into string"))?,
seq_len, n_kmers, filtered_hashes, &filter_stats);
sketches.push(sketch);
}
Ok(MultiSketch {
kmer: kmer_length,
alphabet: String::from("ACGT"),
preserveCase: false,
canonical: true,
sketchSize: final_size as u32,
hashType: String::from("MurmurHash3_x64_128"),
hashBits: 64u16,
hashSeed: seed,
sketches: sketches,
})
}
pub fn mash_stream<R>(reader: R, n_hashes: usize, final_size: usize, kmer_length: u8,
filters: &mut FilterParams, no_strict: bool, seed: u64) -> Result<Sketch> where
R: Read + Seek,
{
let mut seq_len = 0u64;
let mut n_kmers = 0u64;
let mut minhash = match filters.filter_on {
Some(true) | None => MinHashKmers::new(n_hashes, seed),
Some(false) => MinHashKmers::new(final_size, seed),
};
fastx_stream(reader, |seq_type| {
if let None = filters.filter_on {
filters.filter_on = match seq_type {
"FASTA" => Some(false),
"FASTQ" => Some(true),
_ => panic!("Unknown sequence type"),
};
}
}, |seq| {
seq_len += seq.seq.len() as u64;
for (_, kmer, is_rev_complement) in seq.normalize(false).kmers(kmer_length, true) {
let rc_count = match is_rev_complement {
true => 1u8,
false => 0u8,
};
n_kmers += 1;
minhash.push(kmer, rc_count);
}
}).map_err(|e| format_err!("{}", e.to_string()))?;
let hashes = minhash.into_vec();
let (mut filtered_hashes, filter_stats) = filter_sketch(&hashes, &filters);
filtered_hashes.truncate(final_size);
if !no_strict && filtered_hashes.len() < final_size {
bail!("Stream had too few kmers ({}) to sketch", filtered_hashes.len());
}
Ok(Sketch::new("", seq_len, n_kmers, filtered_hashes, &filter_stats))
}