// Crate root: declares the public submodules and exposes helpers for
// sketching FASTA/FASTQ inputs and for loading serialized sketch files.
use std::fs::File;
use std::io::{stdin, BufReader, Read};
use std::path::Path;
use memmap::MmapOptions;
use needletail::parse_fastx_reader;
use needletail::parser::Format;
use rayon::prelude::*;
use crate::filtering::FilterParams;
use crate::serialization::{
read_finch_file, read_mash_file, MultiSketch, Sketch, FINCH_BIN_EXT, FINCH_EXT, MASH_EXT,
};
use crate::sketch_schemes::SketchParams;
// Public submodules of this crate.
// NOTE(review): nothing from `distance` is used in this file; presumably
// sketch-to-sketch distance calculations — confirm against the module.
pub mod distance;
// Provides `FilterParams`, the hash-filtering configuration used below.
pub mod filtering;
// Provides `SketchParams`, which creates sketchers and post-processes hashes.
pub mod sketch_schemes;
// Provides `FinchResult`, the result type returned by this file's functions.
pub mod errors;
// Only compiled when the `python` cargo feature is enabled.
#[cfg(feature = "python")]
pub mod python;
// Provides `Sketch`, `MultiSketch`, the mash/finch file readers and the
// file-extension constants used by `open_sketch_file`.
pub mod serialization;
// NOTE(review): nothing from `statistics` is used in this file — confirm its
// purpose against the module itself.
pub mod statistics;
use crate::errors::FinchResult;
/// Sketches every input in `filenames`, processing the inputs in parallel.
///
/// Each entry is opened as a file, except the special name `"-"`, which reads
/// from standard input. The opened reader is handed to [`sketch_stream`]
/// together with the entry's name, `sketch_params` and `filters`.
///
/// # Errors
///
/// Returns an error if any input cannot be opened or if sketching any input
/// fails.
pub fn sketch_files(
    filenames: &[&str],
    sketch_params: &SketchParams,
    filters: &FilterParams,
) -> FinchResult<Vec<Sketch>> {
    filenames
        .par_iter()
        .map(|&filename| {
            // "-" selects stdin; anything else is treated as a filesystem path.
            let source: Box<dyn Read + Send> = match filename {
                "-" => Box::new(stdin()),
                path => Box::new(File::open(Path::new(path))?),
            };
            sketch_stream(source, filename, sketch_params, filters)
        })
        .collect()
}
pub fn sketch_stream<'a>(
reader: Box<dyn Read + Send + 'a>,
name: &str,
sketch_params: &SketchParams,
filters: &FilterParams,
) -> FinchResult<Sketch> {
let mut filter_params = filters.clone();
let mut sketcher = sketch_params.create_sketcher();
let mut fastx_reader = parse_fastx_reader(reader).expect("valid file TODO");
let mut seq_type = None;
while let Some(record) = fastx_reader.next() {
let seqrec = record.expect("invalid record");
if seq_type.is_none() {
seq_type = Some(seqrec.format());
}
sketcher.process(seqrec);
}
if filter_params.filter_on.is_none() {
filter_params.filter_on = match seq_type.expect("Should have got a type") {
Format::Fasta => Some(false),
Format::Fastq => Some(true),
};
}
let (seq_length, num_valid_kmers) = sketcher.total_bases_and_kmers();
let hashes = sketcher.to_vec();
let mut filtered_hashes = filter_params.filter_counts(&hashes);
sketch_params.process_post_filter(&mut filtered_hashes, name)?;
Ok(Sketch {
name: name.to_string(),
seq_length,
num_valid_kmers,
comment: "".to_string(),
hashes: filtered_hashes,
filter_params,
sketch_params: sketch_params.clone(),
})
}
pub fn open_sketch_file<P: AsRef<Path>>(path: P) -> FinchResult<Vec<Sketch>> {
let p = path.as_ref();
let filename = p
.file_name()
.ok_or_else(|| format_err!("Path does not have a filename: {:?}", p))?
.to_string_lossy();
let file = File::open(p).map_err(|_| format_err!("Error opening {:?}", p))?;
if filename.ends_with(MASH_EXT) {
let mut buf_reader = BufReader::new(file);
read_mash_file(&mut buf_reader)
} else if filename.ends_with(FINCH_BIN_EXT) {
let mut buf_reader = BufReader::new(file);
read_finch_file(&mut buf_reader)
} else if filename.ends_with(FINCH_EXT) || filename.ends_with(".json") {
let mapped = unsafe { MmapOptions::new().map(&file)? };
let multisketch: MultiSketch =
serde_json::from_slice(&mapped).map_err(|_| format_err!("Error parsing {:?}", &p))?;
multisketch.to_sketches()
} else {
Err(format_err!("File suffix is not *.bsk, *.msh, or *.sk"))
}
}