finch/
lib.rs

1use std::fs::File;
2use std::io::{stdin, BufReader, Read};
3use std::path::Path;
4
5use memmap::MmapOptions;
6use needletail::parse_fastx_reader;
7use needletail::parser::Format;
8use rayon::prelude::*;
9
10use crate::filtering::FilterParams;
11use crate::serialization::{
12    read_finch_file, read_mash_file, MultiSketch, Sketch, FINCH_BIN_EXT, FINCH_EXT, MASH_EXT,
13};
14use crate::sketch_schemes::SketchParams;
15
16pub mod distance;
17pub mod filtering;
18pub mod sketch_schemes;
19// it would be nice if there was a `pub(in main)` or something for
20// main_parsing so we don't import it for `lib` itself
21pub mod errors;
22#[cfg(feature = "python")]
23pub mod python;
24pub mod serialization;
25pub mod statistics;
26
27use crate::errors::FinchResult;
28
29pub fn sketch_files(
30    filenames: &[&str],
31    sketch_params: &SketchParams,
32    filters: &FilterParams,
33) -> FinchResult<Vec<Sketch>> {
34    let sketches: FinchResult<Vec<Sketch>> = filenames
35        .par_iter()
36        .map(|filename| {
37            // open the file with a special case to handle stdin
38            let reader: Box<dyn Read + Send> = if filename == &"-" {
39                // We're not locking it so technically not thread safe
40                Box::new(stdin())
41            } else {
42                Box::new(File::open(&Path::new(filename))?)
43            };
44            // sketch!
45            Ok(sketch_stream(reader, filename, sketch_params, &filters)?)
46        })
47        .collect();
48    sketches
49}
50
51pub fn sketch_stream<'a>(
52    reader: Box<dyn Read + Send + 'a>,
53    name: &str,
54    sketch_params: &SketchParams,
55    filters: &FilterParams,
56) -> FinchResult<Sketch> {
57    let mut filter_params = filters.clone();
58    let mut sketcher = sketch_params.create_sketcher();
59    // TODO: remove expects after removing failure
60    let mut fastx_reader = parse_fastx_reader(reader).expect("valid file TODO");
61    let mut seq_type = None;
62    while let Some(record) = fastx_reader.next() {
63        let seqrec = record.expect("invalid record");
64        if seq_type.is_none() {
65            seq_type = Some(seqrec.format());
66        }
67        sketcher.process(seqrec);
68    }
69
70    // disable filtering for FASTA files unless it was explicitly specified
71    if filter_params.filter_on.is_none() {
72        filter_params.filter_on = match seq_type.expect("Should have got a type") {
73            Format::Fasta => Some(false),
74            Format::Fastq => Some(true),
75        };
76    }
77
78    let (seq_length, num_valid_kmers) = sketcher.total_bases_and_kmers();
79    let hashes = sketcher.to_vec();
80
81    // do filtering
82    let mut filtered_hashes = filter_params.filter_counts(&hashes);
83    sketch_params.process_post_filter(&mut filtered_hashes, name)?;
84
85    Ok(Sketch {
86        name: name.to_string(),
87        seq_length,
88        num_valid_kmers,
89        comment: "".to_string(),
90        hashes: filtered_hashes,
91        filter_params,
92        sketch_params: sketch_params.clone(),
93    })
94}
95
96pub fn open_sketch_file<P: AsRef<Path>>(path: P) -> FinchResult<Vec<Sketch>> {
97    let p = path.as_ref();
98    let filename = p
99        .file_name()
100        .ok_or_else(|| format_err!("Path does not have a filename: {:?}", p))?
101        .to_string_lossy();
102    let file = File::open(p).map_err(|_| format_err!("Error opening {:?}", p))?;
103    if filename.ends_with(MASH_EXT) {
104        let mut buf_reader = BufReader::new(file);
105        read_mash_file(&mut buf_reader)
106    } else if filename.ends_with(FINCH_BIN_EXT) {
107        let mut buf_reader = BufReader::new(file);
108        read_finch_file(&mut buf_reader)
109    } else if filename.ends_with(FINCH_EXT) || filename.ends_with(".json") {
110        let mapped = unsafe { MmapOptions::new().map(&file)? };
111        let multisketch: MultiSketch =
112            serde_json::from_slice(&mapped).map_err(|_| format_err!("Error parsing {:?}", &p))?;
113        multisketch.to_sketches()
114    } else {
115        Err(format_err!("File suffix is not *.bsk, *.msh, or *.sk"))
116    }
117}