finch/serialization/
json.rs

1use std::collections::HashMap;
2use std::fmt;
3use std::mem;
4
5use serde::de::{self, Deserializer, Visitor};
6use serde::ser::{SerializeStruct, Serializer};
7use serde::{Deserialize, Serialize};
8
9use crate::bail;
10use crate::errors::FinchResult;
11use crate::filtering::FilterParams;
12pub use crate::serialization::mash::{read_mash_file, write_mash_file};
13use crate::serialization::Sketch;
14use crate::sketch_schemes::{KmerCount, SketchParams};
15
16#[derive(Clone, Debug, Eq, PartialEq)]
17pub struct JsonSketch {
18    pub name: String,
19    pub seq_length: Option<u64>,
20    pub num_valid_kmers: Option<u64>,
21    pub comment: Option<String>,
22    pub filters: Option<HashMap<String, String>>,
23    pub hashes: Vec<KmerCount>,
24}
25
26impl JsonSketch {
27    pub fn new(
28        name: &str,
29        length: u64,
30        n_kmers: u64,
31        kmercounts: Vec<KmerCount>,
32        filters: &HashMap<String, String>,
33    ) -> Self {
34        JsonSketch {
35            name: String::from(name),
36            seq_length: Some(length),
37            num_valid_kmers: Some(n_kmers),
38            comment: Some(String::from("")),
39            filters: Some(filters.clone()),
40            hashes: kmercounts,
41        }
42    }
43
44    pub fn len(&self) -> usize {
45        self.hashes.len()
46    }
47
48    pub fn is_empty(&self) -> bool {
49        self.hashes.is_empty()
50    }
51}
52
53impl Serialize for JsonSketch {
54    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
55    where
56        S: Serializer,
57    {
58        let mut hash_list = Vec::with_capacity(self.hashes.len());
59        let mut kmer_list = Vec::with_capacity(self.hashes.len());
60        let mut count_list = Vec::with_capacity(self.hashes.len());
61        for hash in &self.hashes {
62            hash_list.push(hash.hash.to_string());
63            kmer_list.push(String::from_utf8(hash.kmer.clone()).unwrap());
64            count_list.push(hash.count);
65        }
66
67        let mut state = serializer.serialize_struct("Sketch", 8)?;
68        state.serialize_field("name", &self.name)?;
69        state.serialize_field("seqLength", &self.seq_length)?;
70        state.serialize_field("numValidKmers", &self.num_valid_kmers)?;
71        state.serialize_field("comment", &self.comment)?;
72        state.serialize_field("filters", &self.filters)?;
73        state.serialize_field("hashes", &hash_list)?;
74        state.serialize_field("kmers", &kmer_list)?;
75        state.serialize_field("counts", &count_list)?;
76        state.end()
77    }
78}
79
80impl<'de> Deserialize<'de> for JsonSketch {
81    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
82    where
83        D: Deserializer<'de>,
84    {
85        #[allow(non_snake_case)]
86        #[derive(Deserialize)]
87        struct BaseJsonSketch {
88            pub name: String,
89            pub seqLength: Option<u64>,
90            pub numValidKmers: Option<u64>,
91            pub comment: Option<String>,
92            pub filters: Option<HashMap<String, String>>,
93            hashes: Vec<QuotedU64>,
94            kmers: Option<Vec<String>>,
95            counts: Option<Vec<u32>>,
96        }
97
98        let mut jsketch = BaseJsonSketch::deserialize(deserializer)?;
99
100        let mut kmercount_list = Vec::with_capacity(jsketch.hashes.len());
101        for i in 0..jsketch.hashes.len() {
102            let hash = jsketch.hashes[i].0;
103            let kmer = match &mut jsketch.kmers {
104                Some(v) => mem::replace(&mut v[i], String::new()).into_bytes(),
105                None => Vec::new(),
106            };
107            let count = match &jsketch.counts {
108                Some(v) => v[i],
109                None => 1,
110            };
111            kmercount_list.push(KmerCount {
112                hash,
113                kmer,
114                count,
115                extra_count: count / 2,
116                label: None,
117            });
118        }
119        Ok(JsonSketch {
120            name: jsketch.name,
121            seq_length: jsketch.seqLength,
122            num_valid_kmers: jsketch.numValidKmers,
123            comment: jsketch.comment,
124            filters: jsketch.filters,
125            hashes: kmercount_list,
126        })
127    }
128}
129
130#[derive(Debug, Serialize, Deserialize)]
131pub struct MultiSketch {
132    pub kmer: u8,
133    pub alphabet: String,
134    #[serde(rename = "preserveCase")]
135    pub preserve_case: bool,
136    pub canonical: bool,
137    #[serde(rename = "sketchSize")]
138    pub sketch_size: u32,
139    #[serde(rename = "hashType")]
140    pub hash_type: String,
141    #[serde(rename = "hashBits")]
142    pub hash_bits: u16,
143    #[serde(rename = "hashSeed")]
144    pub hash_seed: u64,
145    pub scale: Option<f64>,
146    pub sketches: Vec<JsonSketch>,
147}
148
149impl MultiSketch {
150    pub fn get_params(&self) -> FinchResult<SketchParams> {
151        Ok(match (&*self.hash_type, self.scale) {
152            ("MurmurHash3_x64_128", None) => {
153                if self.hash_bits != 64 {
154                    bail!(
155                        "Multisketch has incompatible hash size ({} != 64)",
156                        self.hash_bits
157                    );
158                }
159                SketchParams::Mash {
160                    kmers_to_sketch: self.sketch_size as usize,
161                    final_size: self.sketch_size as usize,
162                    no_strict: true,
163                    kmer_length: self.kmer,
164                    hash_seed: self.hash_seed,
165                }
166            }
167            ("MurmurHash3_x64_128", Some(scale)) => {
168                if self.hash_bits != 64 {
169                    bail!(
170                        "Multisketch has incompatible hash size ({} != 64)",
171                        self.hash_bits
172                    );
173                }
174                SketchParams::Scaled {
175                    kmers_to_sketch: self.sketch_size as usize,
176                    kmer_length: self.kmer,
177                    scale,
178                    hash_seed: self.hash_seed,
179                }
180            }
181            ("None", _) => SketchParams::AllCounts {
182                kmer_length: self.kmer,
183            },
184            (x, _) => bail!("{} sketch type is not supported", x),
185        })
186    }
187
188    pub fn from_sketches(sketches: &[Sketch]) -> FinchResult<Self> {
189        let json_sketches: Vec<JsonSketch> = sketches.iter().map(|x| (*x).clone().into()).collect();
190        let sketch_params = SketchParams::from_sketches(&sketches)?;
191        // TODO: the scale isn't actually harmonized between the sketches at
192        // this point; it probably should be?
193        let (hash_type, hash_bits, hash_seed, scale) = sketch_params.hash_info();
194        Ok(MultiSketch {
195            alphabet: "ACGT".to_string(),
196            preserve_case: false,
197            canonical: true,
198
199            sketch_size: sketch_params.expected_size() as u32,
200            kmer: sketch_params.k(),
201            hash_type: hash_type.to_string(),
202            hash_bits,
203            hash_seed,
204            scale,
205            sketches: json_sketches,
206        })
207    }
208
209    pub fn to_sketches(&self) -> FinchResult<Vec<Sketch>> {
210        let empty_hashmap = HashMap::new();
211        let mut sketches = Vec::with_capacity(self.sketches.len());
212        let sketch_params = self.get_params()?;
213        for sketch in &self.sketches {
214            let filters = sketch.filters.as_ref().unwrap_or(&empty_hashmap);
215            let filter_params = FilterParams::from_serialized(filters)?;
216            sketches.push(Sketch {
217                name: sketch.name.clone(),
218                seq_length: sketch.seq_length.unwrap_or(0),
219                num_valid_kmers: sketch.num_valid_kmers.unwrap_or(0),
220                comment: sketch.comment.clone().unwrap_or_else(|| "".to_string()),
221                hashes: sketch.hashes.clone(),
222                filter_params,
223                sketch_params: sketch_params.clone(),
224            });
225        }
226        Ok(sketches)
227    }
228}
229
230struct QuotedU64(u64);
231
232impl<'de> Deserialize<'de> for QuotedU64 {
233    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
234    where
235        D: Deserializer<'de>,
236    {
237        struct QuotedU64Visitor;
238
239        impl<'de> Visitor<'de> for QuotedU64Visitor {
240            type Value = QuotedU64;
241
242            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
243                formatter.write_str("usize as a json string")
244            }
245
246            fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
247            where
248                E: de::Error,
249            {
250                value.parse().map(QuotedU64).map_err(de::Error::custom)
251            }
252        }
253
254        deserializer.deserialize_str(QuotedU64Visitor)
255    }
256}