finch/serialization/
mod.rs

// These two modules are generated code, so lints, dead-code warnings and rustfmt are disabled for them.
2#[allow(clippy::all)]
3#[allow(dead_code)]
4#[cfg_attr(rustfmt, rustfmt_skip)]
5mod finch_capnp;
6#[allow(clippy::all)]
7#[allow(dead_code)]
8#[cfg_attr(rustfmt, rustfmt_skip)]
9mod mash_capnp;
10
11mod json;
12mod mash;
13
14use std::io::{BufRead, Write};
15
16use capnp::message;
17use capnp::serialize as capnp_serialize;
18use serde::{Deserialize, Serialize};
19
20use crate::errors::FinchResult;
21use crate::filtering::FilterParams;
22use crate::serialization::finch_capnp::{multisketch, sketch_params, SketchMethod};
23pub use crate::serialization::json::{JsonSketch, MultiSketch};
24pub use crate::serialization::mash::{read_mash_file, write_mash_file};
25use crate::sketch_schemes::{KmerCount, SketchParams};
26
/// The `.sk` file extension.
/// NOTE(review): presumably the text (JSON) finch sketch format — confirm against callers.
pub const FINCH_EXT: &str = ".sk";
/// The `.bsk` file extension.
/// NOTE(review): presumably the binary finch sketch format handled by
/// `write_finch_file` / `read_finch_file` — confirm against callers.
pub const FINCH_BIN_EXT: &str = ".bsk";
/// The `.msh` file extension.
/// NOTE(review): presumably Mash sketch files (see `read_mash_file` / `write_mash_file`) — confirm.
pub const MASH_EXT: &str = ".msh";
30
31#[derive(Debug, Serialize, Deserialize)]
32pub struct SketchDistance {
33    pub containment: f64,
34    pub jaccard: f64,
35    #[serde(rename = "mashDistance")]
36    pub mash_distance: f64,
37    #[serde(rename = "commonHashes")]
38    pub common_hashes: u64,
39    #[serde(rename = "totalHashes")]
40    pub total_hashes: u64,
41    pub query: String,
42    pub reference: String,
43}
44
/// In-memory representation of a single sketch, as read from or written to
/// the finch binary format by `read_finch_file` / `write_finch_file`.
#[derive(Clone, Debug, PartialEq)]
pub struct Sketch {
    /// Name of the sketch.
    pub name: String,
    /// Sequence length recorded for the sketch.
    /// NOTE(review): presumably the total length of the sketched sequence(s) — confirm.
    pub seq_length: u64,
    /// Number of valid k-mers recorded for the sketch.
    pub num_valid_kmers: u64,
    /// Free-form comment stored alongside the sketch.
    pub comment: String,

    /// The hash entries (hash, k-mer, counts and optional label) making up the sketch.
    pub hashes: Vec<KmerCount>,
    /// Filtering parameters associated with the sketch.
    pub filter_params: FilterParams,
    /// Sketching parameters associated with the sketch.
    pub sketch_params: SketchParams,
}
56
impl Sketch {
    /// Returns the number of hash entries stored in this sketch.
    pub fn len(&self) -> usize {
        self.hashes.len()
    }

    /// Returns `true` when the sketch holds no hash entries.
    pub fn is_empty(&self) -> bool {
        self.hashes.is_empty()
    }
}
66
67impl Into<JsonSketch> for Sketch {
68    fn into(self) -> JsonSketch {
69        JsonSketch::new(
70            &self.name,
71            self.seq_length,
72            self.num_valid_kmers,
73            self.hashes,
74            &self.filter_params.to_serialized(),
75        )
76    }
77}
78
79fn set_sketch_params(mut cap_sketch_params: sketch_params::Builder, sketch_params: &SketchParams) {
80    match *sketch_params {
81        SketchParams::Mash {
82            kmers_to_sketch,
83            final_size,
84            no_strict,
85            kmer_length,
86            hash_seed,
87        } => {
88            cap_sketch_params.set_sketch_method(SketchMethod::MurmurHash3);
89            cap_sketch_params.set_kmer_length(kmer_length);
90            cap_sketch_params.set_kmers_to_sketch(kmers_to_sketch as u64);
91            cap_sketch_params.set_hash_seed(hash_seed);
92            cap_sketch_params.set_final_size(final_size as u64);
93            cap_sketch_params.set_no_strict(no_strict);
94        }
95        SketchParams::Scaled {
96            kmers_to_sketch,
97            kmer_length,
98            scale,
99            hash_seed,
100        } => {
101            cap_sketch_params.set_sketch_method(SketchMethod::MurmurHash3Scaled);
102            cap_sketch_params.set_kmer_length(kmer_length);
103            cap_sketch_params.set_kmers_to_sketch(kmers_to_sketch as u64);
104            cap_sketch_params.set_hash_seed(hash_seed);
105            cap_sketch_params.set_scale(scale);
106        }
107        SketchParams::AllCounts { kmer_length } => {
108            cap_sketch_params.set_sketch_method(SketchMethod::None);
109            cap_sketch_params.set_kmer_length(kmer_length);
110        }
111    }
112}
113
114fn get_sketch_params(cap_sketch_params: sketch_params::Reader) -> FinchResult<SketchParams> {
115    Ok(match cap_sketch_params.get_sketch_method()? {
116        SketchMethod::MurmurHash3 => SketchParams::Mash {
117            kmers_to_sketch: cap_sketch_params.get_kmers_to_sketch() as usize,
118            final_size: cap_sketch_params.get_final_size() as usize,
119            no_strict: cap_sketch_params.get_no_strict(),
120            kmer_length: cap_sketch_params.get_kmer_length(),
121            hash_seed: cap_sketch_params.get_hash_seed(),
122        },
123        SketchMethod::MurmurHash3Scaled => SketchParams::Scaled {
124            kmers_to_sketch: cap_sketch_params.get_kmers_to_sketch() as usize,
125            kmer_length: cap_sketch_params.get_kmer_length(),
126            scale: cap_sketch_params.get_scale(),
127            hash_seed: cap_sketch_params.get_hash_seed(),
128        },
129        SketchMethod::None => SketchParams::AllCounts {
130            kmer_length: cap_sketch_params.get_kmer_length(),
131        },
132    })
133}
134
135pub fn write_finch_file(mut file: &mut dyn Write, sketches: &[Sketch]) -> FinchResult<()> {
136    let mut message = message::Builder::new_default();
137    let finch_file: multisketch::Builder = message.init_root::<multisketch::Builder>();
138
139    let mut cap_sketches = finch_file.init_sketches(sketches.len() as u32);
140    for (i, sketch) in sketches.iter().enumerate() {
141        let mut cap_sketch = cap_sketches.reborrow().get(i as u32);
142        cap_sketch.set_name(&sketch.name);
143        cap_sketch.set_seq_length(sketch.seq_length);
144        cap_sketch.set_num_valid_kmers(sketch.num_valid_kmers);
145        cap_sketch.set_comment(&sketch.comment);
146
147        // TODO: we should probably error if hashes.len() > 2**32?
148        // (and handle these `as u32`s a little better in general
149        let mut hashes = cap_sketch
150            .reborrow()
151            .init_hashes(sketch.hashes.len() as u32);
152        for (j, hash) in sketch.hashes.iter().enumerate() {
153            let mut cap_hash = hashes.reborrow().get(j as u32);
154            cap_hash.set_hash(hash.hash);
155            cap_hash.set_kmer(&hash.kmer);
156            cap_hash.set_count(hash.count);
157            cap_hash.set_extra_count(hash.extra_count);
158            if let Some(label) = &hash.label {
159                cap_hash.set_label(label);
160            }
161        }
162
163        let mut cap_filter_params = cap_sketch.reborrow().init_filter_params();
164        cap_filter_params.set_filtered(sketch.filter_params.filter_on.unwrap_or(false));
165        cap_filter_params.set_low_abun_filter(sketch.filter_params.abun_filter.0.unwrap_or(0));
166        cap_filter_params.set_high_abun_filter(
167            sketch
168                .filter_params
169                .abun_filter
170                .1
171                .unwrap_or(::std::u32::MAX),
172        );
173        cap_filter_params.set_err_filter(sketch.filter_params.err_filter);
174        cap_filter_params.set_strand_filter(sketch.filter_params.strand_filter);
175
176        let sketch_params = &sketch.sketch_params;
177        let cap_sketch_params = cap_sketch.reborrow().init_sketch_params();
178        set_sketch_params(cap_sketch_params, &sketch_params);
179    }
180
181    capnp_serialize::write_message(&mut file, &message)?;
182    Ok(())
183}
184
185pub fn read_finch_file(mut file: &mut dyn BufRead) -> FinchResult<Vec<Sketch>> {
186    let options = *message::ReaderOptions::new().traversal_limit_in_words(Some(1024 * 1024 * 1024));
187    let reader = capnp_serialize::read_message(&mut file, options)?;
188    let cap_data: multisketch::Reader = reader.get_root::<multisketch::Reader>()?;
189    let cap_sketches = cap_data.get_sketches()?;
190
191    let mut sketches = Vec::with_capacity(cap_sketches.len() as usize);
192    for cap_sketch in cap_sketches {
193        let cap_hashes = cap_sketch.get_hashes()?;
194        let mut hashes = Vec::with_capacity(cap_hashes.len() as usize);
195        for cap_hash in cap_hashes {
196            let label = if cap_hash.has_label() {
197                Some(cap_hash.get_label()?.to_vec())
198            } else {
199                None
200            };
201            hashes.push(KmerCount {
202                hash: cap_hash.get_hash(),
203                kmer: cap_hash.get_kmer()?.to_vec(),
204                count: cap_hash.get_count(),
205                extra_count: cap_hash.get_extra_count(),
206                label,
207            });
208        }
209
210        let cap_sketch_params = cap_sketch.get_sketch_params()?;
211        let sketch_params = get_sketch_params(cap_sketch_params)?;
212
213        let cap_filter_params = cap_sketch.get_filter_params()?;
214        let low_abun_filter = match cap_filter_params.get_low_abun_filter() {
215            0 => None,
216            i => Some(i),
217        };
218        let high_abun_filter = match cap_filter_params.get_high_abun_filter() {
219            ::std::u32::MAX => None,
220            i => Some(i),
221        };
222        let filter_params = FilterParams {
223            filter_on: Some(cap_filter_params.get_filtered()),
224            abun_filter: (low_abun_filter, high_abun_filter),
225            err_filter: cap_filter_params.get_err_filter(),
226            strand_filter: cap_filter_params.get_strand_filter(),
227        };
228
229        sketches.push(Sketch {
230            name: cap_sketch.get_name()?.to_string(),
231            seq_length: cap_sketch.get_seq_length(),
232            num_valid_kmers: cap_sketch.get_num_valid_kmers(),
233            comment: cap_sketch.get_comment()?.to_string(),
234
235            hashes,
236            sketch_params,
237            filter_params,
238        });
239    }
240    Ok(sketches)
241}