finch/serialization/
mod.rs1#[allow(clippy::all)]
3#[allow(dead_code)]
4#[cfg_attr(rustfmt, rustfmt_skip)]
5mod finch_capnp;
6#[allow(clippy::all)]
7#[allow(dead_code)]
8#[cfg_attr(rustfmt, rustfmt_skip)]
9mod mash_capnp;
10
11mod json;
12mod mash;
13
14use std::io::{BufRead, Write};
15
16use capnp::message;
17use capnp::serialize as capnp_serialize;
18use serde::{Deserialize, Serialize};
19
20use crate::errors::FinchResult;
21use crate::filtering::FilterParams;
22use crate::serialization::finch_capnp::{multisketch, sketch_params, SketchMethod};
23pub use crate::serialization::json::{JsonSketch, MultiSketch};
24pub use crate::serialization::mash::{read_mash_file, write_mash_file};
25use crate::sketch_schemes::{KmerCount, SketchParams};
26
/// File-name extension (`.sk`) for Finch sketch files.
/// NOTE(review): the serialization format tied to this extension is not visible in this module — confirm at the call sites.
pub const FINCH_EXT: &str = ".sk";
/// File-name extension (`.bsk`) for binary Finch sketch files.
/// NOTE(review): presumably the format handled by `write_finch_file` / `read_finch_file` — confirm.
pub const FINCH_BIN_EXT: &str = ".bsk";
/// File-name extension (`.msh`) for Mash sketch files.
/// NOTE(review): presumably the format handled by `read_mash_file` / `write_mash_file` — confirm.
pub const MASH_EXT: &str = ".msh";
30
31#[derive(Debug, Serialize, Deserialize)]
32pub struct SketchDistance {
33 pub containment: f64,
34 pub jaccard: f64,
35 #[serde(rename = "mashDistance")]
36 pub mash_distance: f64,
37 #[serde(rename = "commonHashes")]
38 pub common_hashes: u64,
39 #[serde(rename = "totalHashes")]
40 pub total_hashes: u64,
41 pub query: String,
42 pub reference: String,
43}
44
45#[derive(Clone, Debug, PartialEq)]
46pub struct Sketch {
47 pub name: String,
48 pub seq_length: u64,
49 pub num_valid_kmers: u64,
50 pub comment: String,
51
52 pub hashes: Vec<KmerCount>,
53 pub filter_params: FilterParams,
54 pub sketch_params: SketchParams,
55}
56
57impl Sketch {
58 pub fn len(&self) -> usize {
59 self.hashes.len()
60 }
61
62 pub fn is_empty(&self) -> bool {
63 self.hashes.is_empty()
64 }
65}
66
67impl Into<JsonSketch> for Sketch {
68 fn into(self) -> JsonSketch {
69 JsonSketch::new(
70 &self.name,
71 self.seq_length,
72 self.num_valid_kmers,
73 self.hashes,
74 &self.filter_params.to_serialized(),
75 )
76 }
77}
78
79fn set_sketch_params(mut cap_sketch_params: sketch_params::Builder, sketch_params: &SketchParams) {
80 match *sketch_params {
81 SketchParams::Mash {
82 kmers_to_sketch,
83 final_size,
84 no_strict,
85 kmer_length,
86 hash_seed,
87 } => {
88 cap_sketch_params.set_sketch_method(SketchMethod::MurmurHash3);
89 cap_sketch_params.set_kmer_length(kmer_length);
90 cap_sketch_params.set_kmers_to_sketch(kmers_to_sketch as u64);
91 cap_sketch_params.set_hash_seed(hash_seed);
92 cap_sketch_params.set_final_size(final_size as u64);
93 cap_sketch_params.set_no_strict(no_strict);
94 }
95 SketchParams::Scaled {
96 kmers_to_sketch,
97 kmer_length,
98 scale,
99 hash_seed,
100 } => {
101 cap_sketch_params.set_sketch_method(SketchMethod::MurmurHash3Scaled);
102 cap_sketch_params.set_kmer_length(kmer_length);
103 cap_sketch_params.set_kmers_to_sketch(kmers_to_sketch as u64);
104 cap_sketch_params.set_hash_seed(hash_seed);
105 cap_sketch_params.set_scale(scale);
106 }
107 SketchParams::AllCounts { kmer_length } => {
108 cap_sketch_params.set_sketch_method(SketchMethod::None);
109 cap_sketch_params.set_kmer_length(kmer_length);
110 }
111 }
112}
113
114fn get_sketch_params(cap_sketch_params: sketch_params::Reader) -> FinchResult<SketchParams> {
115 Ok(match cap_sketch_params.get_sketch_method()? {
116 SketchMethod::MurmurHash3 => SketchParams::Mash {
117 kmers_to_sketch: cap_sketch_params.get_kmers_to_sketch() as usize,
118 final_size: cap_sketch_params.get_final_size() as usize,
119 no_strict: cap_sketch_params.get_no_strict(),
120 kmer_length: cap_sketch_params.get_kmer_length(),
121 hash_seed: cap_sketch_params.get_hash_seed(),
122 },
123 SketchMethod::MurmurHash3Scaled => SketchParams::Scaled {
124 kmers_to_sketch: cap_sketch_params.get_kmers_to_sketch() as usize,
125 kmer_length: cap_sketch_params.get_kmer_length(),
126 scale: cap_sketch_params.get_scale(),
127 hash_seed: cap_sketch_params.get_hash_seed(),
128 },
129 SketchMethod::None => SketchParams::AllCounts {
130 kmer_length: cap_sketch_params.get_kmer_length(),
131 },
132 })
133}
134
135pub fn write_finch_file(mut file: &mut dyn Write, sketches: &[Sketch]) -> FinchResult<()> {
136 let mut message = message::Builder::new_default();
137 let finch_file: multisketch::Builder = message.init_root::<multisketch::Builder>();
138
139 let mut cap_sketches = finch_file.init_sketches(sketches.len() as u32);
140 for (i, sketch) in sketches.iter().enumerate() {
141 let mut cap_sketch = cap_sketches.reborrow().get(i as u32);
142 cap_sketch.set_name(&sketch.name);
143 cap_sketch.set_seq_length(sketch.seq_length);
144 cap_sketch.set_num_valid_kmers(sketch.num_valid_kmers);
145 cap_sketch.set_comment(&sketch.comment);
146
147 let mut hashes = cap_sketch
150 .reborrow()
151 .init_hashes(sketch.hashes.len() as u32);
152 for (j, hash) in sketch.hashes.iter().enumerate() {
153 let mut cap_hash = hashes.reborrow().get(j as u32);
154 cap_hash.set_hash(hash.hash);
155 cap_hash.set_kmer(&hash.kmer);
156 cap_hash.set_count(hash.count);
157 cap_hash.set_extra_count(hash.extra_count);
158 if let Some(label) = &hash.label {
159 cap_hash.set_label(label);
160 }
161 }
162
163 let mut cap_filter_params = cap_sketch.reborrow().init_filter_params();
164 cap_filter_params.set_filtered(sketch.filter_params.filter_on.unwrap_or(false));
165 cap_filter_params.set_low_abun_filter(sketch.filter_params.abun_filter.0.unwrap_or(0));
166 cap_filter_params.set_high_abun_filter(
167 sketch
168 .filter_params
169 .abun_filter
170 .1
171 .unwrap_or(::std::u32::MAX),
172 );
173 cap_filter_params.set_err_filter(sketch.filter_params.err_filter);
174 cap_filter_params.set_strand_filter(sketch.filter_params.strand_filter);
175
176 let sketch_params = &sketch.sketch_params;
177 let cap_sketch_params = cap_sketch.reborrow().init_sketch_params();
178 set_sketch_params(cap_sketch_params, &sketch_params);
179 }
180
181 capnp_serialize::write_message(&mut file, &message)?;
182 Ok(())
183}
184
185pub fn read_finch_file(mut file: &mut dyn BufRead) -> FinchResult<Vec<Sketch>> {
186 let options = *message::ReaderOptions::new().traversal_limit_in_words(Some(1024 * 1024 * 1024));
187 let reader = capnp_serialize::read_message(&mut file, options)?;
188 let cap_data: multisketch::Reader = reader.get_root::<multisketch::Reader>()?;
189 let cap_sketches = cap_data.get_sketches()?;
190
191 let mut sketches = Vec::with_capacity(cap_sketches.len() as usize);
192 for cap_sketch in cap_sketches {
193 let cap_hashes = cap_sketch.get_hashes()?;
194 let mut hashes = Vec::with_capacity(cap_hashes.len() as usize);
195 for cap_hash in cap_hashes {
196 let label = if cap_hash.has_label() {
197 Some(cap_hash.get_label()?.to_vec())
198 } else {
199 None
200 };
201 hashes.push(KmerCount {
202 hash: cap_hash.get_hash(),
203 kmer: cap_hash.get_kmer()?.to_vec(),
204 count: cap_hash.get_count(),
205 extra_count: cap_hash.get_extra_count(),
206 label,
207 });
208 }
209
210 let cap_sketch_params = cap_sketch.get_sketch_params()?;
211 let sketch_params = get_sketch_params(cap_sketch_params)?;
212
213 let cap_filter_params = cap_sketch.get_filter_params()?;
214 let low_abun_filter = match cap_filter_params.get_low_abun_filter() {
215 0 => None,
216 i => Some(i),
217 };
218 let high_abun_filter = match cap_filter_params.get_high_abun_filter() {
219 ::std::u32::MAX => None,
220 i => Some(i),
221 };
222 let filter_params = FilterParams {
223 filter_on: Some(cap_filter_params.get_filtered()),
224 abun_filter: (low_abun_filter, high_abun_filter),
225 err_filter: cap_filter_params.get_err_filter(),
226 strand_filter: cap_filter_params.get_strand_filter(),
227 };
228
229 sketches.push(Sketch {
230 name: cap_sketch.get_name()?.to_string(),
231 seq_length: cap_sketch.get_seq_length(),
232 num_valid_kmers: cap_sketch.get_num_valid_kmers(),
233 comment: cap_sketch.get_comment()?.to_string(),
234
235 hashes,
236 sketch_params,
237 filter_params,
238 });
239 }
240 Ok(sketches)
241}