1use serde::{Deserialize, Serialize};
12use std::collections::HashSet;
13
14use crate::core::contig::SequenceRole;
15use crate::core::types::ReferenceSource;
16
17#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
19pub struct HierarchicalAssembly {
20 pub id: String,
22 pub name: String,
24 pub organism: String,
26 pub versions: Vec<AssemblyVersion>,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
32pub struct AssemblyVersion {
33 pub id: String,
35 pub version: String,
37 pub source: ReportSource,
39 #[serde(default, skip_serializing_if = "Vec::is_empty")]
41 pub report_contigs: Vec<ReportContig>,
42 #[serde(default, skip_serializing_if = "Vec::is_empty")]
44 pub fasta_distributions: Vec<FastaDistribution>,
45}
46
47#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
49#[serde(tag = "type", rename_all = "snake_case")]
50pub enum ReportSource {
51 Ncbi {
53 accession: String,
55 #[serde(default, skip_serializing_if = "Option::is_none")]
57 url: Option<String>,
58 #[serde(default, skip_serializing_if = "Option::is_none")]
60 date: Option<String>,
61 },
62 DerivedFromFasta {
64 source_files: Vec<String>,
66 #[serde(default, skip_serializing_if = "Option::is_none")]
68 base_assembly: Option<String>,
69 },
70 Manual {
72 #[serde(default, skip_serializing_if = "Option::is_none")]
73 curator: Option<String>,
74 #[serde(default, skip_serializing_if = "Option::is_none")]
75 notes: Option<String>,
76 },
77}
78
79#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
81pub struct ReportContig {
82 pub id: u32,
84 pub sequence_name: String,
86 pub length: u64,
88 #[serde(default, skip_serializing_if = "Option::is_none")]
90 pub md5: Option<String>,
91 #[serde(default, skip_serializing_if = "Option::is_none")]
93 pub refseq_accn: Option<String>,
94 #[serde(default, skip_serializing_if = "Option::is_none")]
96 pub genbank_accn: Option<String>,
97 #[serde(default, skip_serializing_if = "Option::is_none")]
99 pub ucsc_name: Option<String>,
100 #[serde(default)]
102 pub sequence_role: SequenceRole,
103 #[serde(default, skip_serializing_if = "Option::is_none")]
105 pub assigned_molecule: Option<String>,
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
110pub struct FastaDistribution {
111 pub id: String,
113 pub display_name: String,
115 pub source: ReferenceSource,
117 #[serde(default, skip_serializing_if = "Option::is_none")]
119 pub download_url: Option<String>,
120 #[serde(default, skip_serializing_if = "Vec::is_empty")]
122 pub tags: Vec<String>,
123 pub contigs: Vec<FastaContig>,
125}
126
127impl FastaDistribution {
128 #[must_use]
130 pub fn presence_counts(&self) -> PresenceCounts {
131 let mut counts = PresenceCounts::default();
132 for contig in &self.contigs {
133 if contig.report_contig_id.is_some() {
134 counts.in_both += 1;
135 } else {
136 counts.fasta_only += 1;
137 }
138 }
139 counts
140 }
141}
142
143#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
145pub struct FastaContig {
146 pub name: String,
148 pub length: u64,
150 pub md5: String,
152 pub sort_order: u32,
154 #[serde(default, skip_serializing_if = "Option::is_none")]
156 pub report_contig_id: Option<u32>,
157 #[serde(default, skip_serializing_if = "Vec::is_empty")]
159 pub aliases: Vec<String>,
160}
161
162impl FastaContig {
163 #[cfg(test)]
164 pub fn new(name: impl Into<String>, length: u64, md5: impl Into<String>) -> Self {
165 Self {
166 name: name.into(),
167 length,
168 md5: md5.into(),
169 sort_order: 0,
170 report_contig_id: None,
171 aliases: Vec::new(),
172 }
173 }
174
175 pub fn merge(&mut self, other: &FastaContig) -> Result<(), ContigMergeError> {
186 if self.name != other.name {
188 return Err(ContigMergeError::NameMismatch {
189 expected: self.name.clone(),
190 found: other.name.clone(),
191 });
192 }
193 if self.length != other.length {
194 return Err(ContigMergeError::LengthMismatch {
195 name: self.name.clone(),
196 expected: self.length,
197 found: other.length,
198 });
199 }
200
201 if !other.md5.is_empty() {
203 if self.md5.is_empty() {
204 self.md5.clone_from(&other.md5);
205 } else if self.md5 != other.md5 {
206 return Err(ContigMergeError::Md5Conflict {
207 name: self.name.clone(),
208 existing: self.md5.clone(),
209 incoming: other.md5.clone(),
210 });
211 }
212 }
213
214 let existing: HashSet<_> = self.aliases.iter().cloned().collect();
216 for alias in &other.aliases {
217 if !existing.contains(alias) && alias != &self.name {
218 self.aliases.push(alias.clone());
219 }
220 }
221
222 if self.report_contig_id.is_none() {
224 self.report_contig_id = other.report_contig_id;
225 }
226
227 Ok(())
228 }
229}
230
231#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
233pub struct PresenceCounts {
234 pub in_both: usize,
236 pub report_only: usize,
238 pub fasta_only: usize,
240}
241
242#[derive(Debug, Clone, PartialEq)]
244pub enum ContigMergeError {
245 NameMismatch {
246 expected: String,
247 found: String,
248 },
249 LengthMismatch {
250 name: String,
251 expected: u64,
252 found: u64,
253 },
254 Md5Conflict {
255 name: String,
256 existing: String,
257 incoming: String,
258 },
259}
260
261impl std::fmt::Display for ContigMergeError {
262 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
263 match self {
264 Self::NameMismatch { expected, found } => {
265 write!(
266 f,
267 "Contig name mismatch: expected '{expected}', found '{found}'"
268 )
269 }
270 Self::LengthMismatch {
271 name,
272 expected,
273 found,
274 } => {
275 write!(
276 f,
277 "Length mismatch for '{name}': expected {expected}, found {found}"
278 )
279 }
280 Self::Md5Conflict {
281 name,
282 existing,
283 incoming,
284 } => {
285 write!(
286 f,
287 "MD5 conflict for '{name}': existing={existing}, incoming={incoming}"
288 )
289 }
290 }
291 }
292}
293
294impl std::error::Error for ContigMergeError {}
295
296#[cfg(test)]
297mod tests {
298 use super::*;
299
300 fn make_contig(name: &str, length: u64, md5: &str, aliases: Vec<&str>) -> FastaContig {
301 FastaContig {
302 name: name.to_string(),
303 length,
304 md5: md5.to_string(),
305 sort_order: 0,
306 report_contig_id: None,
307 aliases: aliases.into_iter().map(String::from).collect(),
308 }
309 }
310
311 #[test]
312 fn test_merge_adds_md5_when_missing() {
313 let mut base = make_contig("chr1", 1000, "", vec![]);
314 let other = make_contig("chr1", 1000, "abc123", vec![]);
315
316 base.merge(&other).unwrap();
317
318 assert_eq!(base.md5, "abc123");
319 }
320
321 #[test]
322 fn test_merge_keeps_existing_md5() {
323 let mut base = make_contig("chr1", 1000, "abc123", vec![]);
324 let other = make_contig("chr1", 1000, "", vec![]);
325
326 base.merge(&other).unwrap();
327
328 assert_eq!(base.md5, "abc123");
329 }
330
331 #[test]
332 fn test_merge_md5_conflict_errors() {
333 let mut base = make_contig("chr1", 1000, "abc123", vec![]);
334 let other = make_contig("chr1", 1000, "def456", vec![]);
335
336 let result = base.merge(&other);
337
338 assert!(matches!(result, Err(ContigMergeError::Md5Conflict { .. })));
339 }
340
341 #[test]
342 fn test_merge_md5_same_value_ok() {
343 let mut base = make_contig("chr1", 1000, "abc123", vec![]);
344 let other = make_contig("chr1", 1000, "abc123", vec![]);
345
346 base.merge(&other).unwrap();
347
348 assert_eq!(base.md5, "abc123");
349 }
350
351 #[test]
352 fn test_merge_unions_aliases() {
353 let mut base = make_contig("chr1", 1000, "", vec!["1", "NC_000001"]);
354 let other = make_contig("chr1", 1000, "", vec!["NC_000001", "CM000663"]);
355
356 base.merge(&other).unwrap();
357
358 assert_eq!(base.aliases.len(), 3);
359 assert!(base.aliases.contains(&"1".to_string()));
360 assert!(base.aliases.contains(&"NC_000001".to_string()));
361 assert!(base.aliases.contains(&"CM000663".to_string()));
362 }
363
364 #[test]
365 fn test_merge_excludes_name_from_aliases() {
366 let mut base = make_contig("chr1", 1000, "", vec![]);
367 let other = make_contig("chr1", 1000, "", vec!["chr1", "1"]);
368
369 base.merge(&other).unwrap();
370
371 assert_eq!(base.aliases, vec!["1"]);
372 assert!(!base.aliases.contains(&"chr1".to_string()));
373 }
374
375 #[test]
376 fn test_merge_length_mismatch_errors() {
377 let mut base = make_contig("chr1", 1000, "", vec![]);
378 let other = make_contig("chr1", 2000, "", vec![]);
379
380 let result = base.merge(&other);
381
382 assert!(matches!(
383 result,
384 Err(ContigMergeError::LengthMismatch { .. })
385 ));
386 }
387
388 #[test]
389 fn test_merge_name_mismatch_errors() {
390 let mut base = make_contig("chr1", 1000, "", vec![]);
391 let other = make_contig("chr2", 1000, "", vec![]);
392
393 let result = base.merge(&other);
394
395 assert!(matches!(result, Err(ContigMergeError::NameMismatch { .. })));
396 }
397
398 #[test]
399 fn test_merge_takes_first_report_contig_id() {
400 let mut base = make_contig("chr1", 1000, "", vec![]);
401 base.report_contig_id = Some(42);
402
403 let mut other = make_contig("chr1", 1000, "", vec![]);
404 other.report_contig_id = Some(99);
405
406 base.merge(&other).unwrap();
407
408 assert_eq!(base.report_contig_id, Some(42));
409 }
410
411 #[test]
412 fn test_merge_fills_missing_report_contig_id() {
413 let mut base = make_contig("chr1", 1000, "", vec![]);
414
415 let mut other = make_contig("chr1", 1000, "", vec![]);
416 other.report_contig_id = Some(42);
417
418 base.merge(&other).unwrap();
419
420 assert_eq!(base.report_contig_id, Some(42));
421 }
422}