1use std::fs::File;
2use std::io::{stdin, BufReader, Read};
3use std::path::Path;
4
5use memmap::MmapOptions;
6use needletail::parse_fastx_reader;
7use needletail::parser::Format;
8use rayon::prelude::*;
9
10use crate::filtering::FilterParams;
11use crate::serialization::{
12 read_finch_file, read_mash_file, MultiSketch, Sketch, FINCH_BIN_EXT, FINCH_EXT, MASH_EXT,
13};
14use crate::sketch_schemes::SketchParams;
15
16pub mod distance;
17pub mod filtering;
18pub mod sketch_schemes;
19pub mod errors;
22#[cfg(feature = "python")]
23pub mod python;
24pub mod serialization;
25pub mod statistics;
26
27use crate::errors::FinchResult;
28
29pub fn sketch_files(
30 filenames: &[&str],
31 sketch_params: &SketchParams,
32 filters: &FilterParams,
33) -> FinchResult<Vec<Sketch>> {
34 let sketches: FinchResult<Vec<Sketch>> = filenames
35 .par_iter()
36 .map(|filename| {
37 let reader: Box<dyn Read + Send> = if filename == &"-" {
39 Box::new(stdin())
41 } else {
42 Box::new(File::open(&Path::new(filename))?)
43 };
44 Ok(sketch_stream(reader, filename, sketch_params, &filters)?)
46 })
47 .collect();
48 sketches
49}
50
51pub fn sketch_stream<'a>(
52 reader: Box<dyn Read + Send + 'a>,
53 name: &str,
54 sketch_params: &SketchParams,
55 filters: &FilterParams,
56) -> FinchResult<Sketch> {
57 let mut filter_params = filters.clone();
58 let mut sketcher = sketch_params.create_sketcher();
59 let mut fastx_reader = parse_fastx_reader(reader).expect("valid file TODO");
61 let mut seq_type = None;
62 while let Some(record) = fastx_reader.next() {
63 let seqrec = record.expect("invalid record");
64 if seq_type.is_none() {
65 seq_type = Some(seqrec.format());
66 }
67 sketcher.process(seqrec);
68 }
69
70 if filter_params.filter_on.is_none() {
72 filter_params.filter_on = match seq_type.expect("Should have got a type") {
73 Format::Fasta => Some(false),
74 Format::Fastq => Some(true),
75 };
76 }
77
78 let (seq_length, num_valid_kmers) = sketcher.total_bases_and_kmers();
79 let hashes = sketcher.to_vec();
80
81 let mut filtered_hashes = filter_params.filter_counts(&hashes);
83 sketch_params.process_post_filter(&mut filtered_hashes, name)?;
84
85 Ok(Sketch {
86 name: name.to_string(),
87 seq_length,
88 num_valid_kmers,
89 comment: "".to_string(),
90 hashes: filtered_hashes,
91 filter_params,
92 sketch_params: sketch_params.clone(),
93 })
94}
95
96pub fn open_sketch_file<P: AsRef<Path>>(path: P) -> FinchResult<Vec<Sketch>> {
97 let p = path.as_ref();
98 let filename = p
99 .file_name()
100 .ok_or_else(|| format_err!("Path does not have a filename: {:?}", p))?
101 .to_string_lossy();
102 let file = File::open(p).map_err(|_| format_err!("Error opening {:?}", p))?;
103 if filename.ends_with(MASH_EXT) {
104 let mut buf_reader = BufReader::new(file);
105 read_mash_file(&mut buf_reader)
106 } else if filename.ends_with(FINCH_BIN_EXT) {
107 let mut buf_reader = BufReader::new(file);
108 read_finch_file(&mut buf_reader)
109 } else if filename.ends_with(FINCH_EXT) || filename.ends_with(".json") {
110 let mapped = unsafe { MmapOptions::new().map(&file)? };
111 let multisketch: MultiSketch =
112 serde_json::from_slice(&mapped).map_err(|_| format_err!("Error parsing {:?}", &p))?;
113 multisketch.to_sketches()
114 } else {
115 Err(format_err!("File suffix is not *.bsk, *.msh, or *.sk"))
116 }
117}