1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
extern crate needletail;
extern crate murmurhash3;
extern crate serde;
#[macro_use] extern crate serde_derive;
extern crate serde_json;

use std::io::{Read, Seek};
use std::path::Path;
use needletail::fastx::{fastx_cli, fastx_stream};

use filtering::{FilterParams, filter_sketch};
use minhashes::MinHashKmers;
use serialization::{JSONSketch, JSONMultiSketch};

pub mod minhashes;
pub mod filtering;
pub mod distance;
pub mod serialization;
pub mod statistics;

pub fn mash_files(filenames: Vec<&str>, n_hashes: usize, final_size: usize, kmer_length: u8, filters: &mut FilterParams, no_strict: bool, seed: u64) -> Result<JSONMultiSketch, String> {
    let mut sketches = Vec::with_capacity(filenames.len());
    for filename in &filenames {
        let mut seq_len = 0u64;
        let mut n_kmers = 0u64;
        let path = Path::new(filename);
        let mut minhash = match filters.filter_on {
            Some(true) | None => MinHashKmers::new(n_hashes, seed),
            Some(false) => MinHashKmers::new(final_size, seed),
        };
        fastx_cli(path.to_str().ok_or("Couldn't make path into string")?, |seq_type| {
            // disable filtering for FASTA files unless it was explicitly specified
            if let None = filters.filter_on {
                filters.filter_on = match seq_type {
                    "FASTA" => Some(false),
                    "FASTQ" => Some(true),
                    _ => panic!("Unknown sequence type"),
                };
            }
        }, |seq| {
            seq_len += seq.seq.len() as u64;
            for (_, kmer, is_rev_complement) in seq.normalize(false).kmers(kmer_length, true) {
                let rc_count = match is_rev_complement {
                    true => 1u8,
                    false => 0u8,
                };
                n_kmers += 1;
                minhash.push(kmer, rc_count);
            }
        }).map_err(|e| e.to_string())?;

        let hashes = minhash.into_vec();
        let (mut filtered_hashes, filter_stats) = filter_sketch(&hashes, &filters);
        filtered_hashes.truncate(final_size);
        if !no_strict && filtered_hashes.len() < final_size {
            return Err(format!("{} had too few kmers ({}) to sketch", filename, filtered_hashes.len()));
        }

        // directory should be clipped from filename
        let basename = path.file_name().ok_or("Couldn't get filename from path")?;
        let sketch = JSONSketch::new(basename.to_str().ok_or("Couldn't make filename into string")?,
                                     seq_len, n_kmers, filtered_hashes, &filter_stats);
        sketches.push(sketch);
    }
    Ok(JSONMultiSketch {
        kmer: kmer_length,
        alphabet: String::from("ACGT"),
        preserveCase: false,
        canonical: true,
        sketchSize: final_size as u32,
        hashType: String::from("MurmurHash3_x64_128"),
        hashBits: 64u16,
        hashSeed: seed,
        sketches: sketches,
    })
}


pub fn mash_stream<R>(reader: R, n_hashes: usize, final_size: usize, kmer_length: u8,
                      filters: &mut FilterParams, no_strict: bool, seed: u64) -> Result<JSONSketch, String> where
    R: Read + Seek,
{
        let mut seq_len = 0u64;
        let mut n_kmers = 0u64;
        let mut minhash = match filters.filter_on {
            Some(true) | None => MinHashKmers::new(n_hashes, seed),
            Some(false) => MinHashKmers::new(final_size, seed),
        };
        fastx_stream(reader, |seq_type| {
            // disable filtering for FASTA files unless it was explicitly specified
            if let None = filters.filter_on {
                filters.filter_on = match seq_type {
                    "FASTA" => Some(false),
                    "FASTQ" => Some(true),
                    _ => panic!("Unknown sequence type"),
                };
            }
        }, |seq| {
            seq_len += seq.seq.len() as u64;
            for (_, kmer, is_rev_complement) in seq.normalize(false).kmers(kmer_length, true) {
                let rc_count = match is_rev_complement {
                    true => 1u8,
                    false => 0u8,
                };
                n_kmers += 1;
                minhash.push(kmer, rc_count);
            }
        }).map_err(|e| e.to_string())?;

        let hashes = minhash.into_vec();
        let (mut filtered_hashes, filter_stats) = filter_sketch(&hashes, &filters);
        filtered_hashes.truncate(final_size);
        if !no_strict && filtered_hashes.len() < final_size {
            return Err(format!("Stream had too few kmers ({}) to sketch", filtered_hashes.len()));
        }

        Ok(JSONSketch::new("", seq_len, n_kmers, filtered_hashes, &filter_stats))
}