1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
use std::fs::File;
use std::io::{stdin, BufReader, Read};
use std::path::Path;

use memmap::MmapOptions;
use needletail::parse_fastx_reader;
use needletail::parser::Format;
use rayon::prelude::*;

use crate::filtering::FilterParams;
use crate::serialization::{
    read_finch_file, read_mash_file, MultiSketch, Sketch, FINCH_BIN_EXT, FINCH_EXT, MASH_EXT,
};
use crate::sketch_schemes::SketchParams;

pub mod distance;
pub mod filtering;
pub mod sketch_schemes;
// it would be nice if there was a `pub(in main)` or something for
// main_parsing so we don't import it for `lib` itself
pub mod errors;
#[cfg(feature = "python")]
pub mod python;
pub mod serialization;
pub mod statistics;

use crate::errors::FinchResult;

pub fn sketch_files(
    filenames: &[&str],
    sketch_params: &SketchParams,
    filters: &FilterParams,
) -> FinchResult<Vec<Sketch>> {
    let sketches: FinchResult<Vec<Sketch>> = filenames
        .par_iter()
        .map(|filename| {
            // open the file with a special case to handle stdin
            let reader: Box<dyn Read + Send> = if filename == &"-" {
                // We're not locking it so technically not thread safe
                Box::new(stdin())
            } else {
                Box::new(File::open(&Path::new(filename))?)
            };
            // sketch!
            Ok(sketch_stream(reader, filename, sketch_params, &filters)?)
        })
        .collect();
    sketches
}

pub fn sketch_stream<'a>(
    reader: Box<dyn Read + Send + 'a>,
    name: &str,
    sketch_params: &SketchParams,
    filters: &FilterParams,
) -> FinchResult<Sketch> {
    let mut filter_params = filters.clone();
    let mut sketcher = sketch_params.create_sketcher();
    // TODO: remove expects after removing failure
    let mut fastx_reader = parse_fastx_reader(reader).expect("valid file TODO");
    let mut seq_type = None;
    while let Some(record) = fastx_reader.next() {
        let seqrec = record.expect("invalid record");
        if seq_type.is_none() {
            seq_type = Some(seqrec.format());
        }
        sketcher.process(seqrec);
    }

    // disable filtering for FASTA files unless it was explicitly specified
    if filter_params.filter_on.is_none() {
        filter_params.filter_on = match seq_type.expect("Should have got a type") {
            Format::Fasta => Some(false),
            Format::Fastq => Some(true),
        };
    }

    let (seq_length, num_valid_kmers) = sketcher.total_bases_and_kmers();
    let hashes = sketcher.to_vec();

    // do filtering
    let mut filtered_hashes = filter_params.filter_counts(&hashes);
    sketch_params.process_post_filter(&mut filtered_hashes, name)?;

    Ok(Sketch {
        name: name.to_string(),
        seq_length,
        num_valid_kmers,
        comment: "".to_string(),
        hashes: filtered_hashes,
        filter_params,
        sketch_params: sketch_params.clone(),
    })
}

pub fn open_sketch_file<P: AsRef<Path>>(path: P) -> FinchResult<Vec<Sketch>> {
    let p = path.as_ref();
    let filename = p
        .file_name()
        .ok_or_else(|| format_err!("Path does not have a filename: {:?}", p))?
        .to_string_lossy();
    let file = File::open(p).map_err(|_| format_err!("Error opening {:?}", p))?;
    if filename.ends_with(MASH_EXT) {
        let mut buf_reader = BufReader::new(file);
        read_mash_file(&mut buf_reader)
    } else if filename.ends_with(FINCH_BIN_EXT) {
        let mut buf_reader = BufReader::new(file);
        read_finch_file(&mut buf_reader)
    } else if filename.ends_with(FINCH_EXT) || filename.ends_with(".json") {
        let mapped = unsafe { MmapOptions::new().map(&file)? };
        let multisketch: MultiSketch =
            serde_json::from_slice(&mapped).map_err(|_| format_err!("Error parsing {:?}", &p))?;
        multisketch.to_sketches()
    } else {
        Err(format_err!("File suffix is not *.bsk, *.msh, or *.sk"))
    }
}