1use std::collections::HashMap;
41
42use crate::core::contig::{Contig, SequenceRole};
43use crate::parsing::sam::ParseError;
44use crate::utils::validation::check_contig_limit;
45
46#[derive(Debug, Clone, Copy, PartialEq, Eq)]
48pub enum PatchType {
49 Fix,
51 Novel,
53}
54
55#[derive(Debug, Clone)]
57pub struct NcbiContigEntry {
58 pub sequence_name: String,
60 pub length: u64,
62 pub genbank_accn: Option<String>,
64 pub refseq_accn: Option<String>,
66 pub ucsc_name: Option<String>,
68 pub role: Option<String>,
70 pub assigned_molecule: Option<String>,
72 pub patch_type: Option<PatchType>,
74}
75
76#[must_use]
110pub fn generate_ucsc_patch_name(
111 genbank_accession: &str,
112 chromosome: &str,
113 patch_type: PatchType,
114) -> Option<String> {
115 let parts: Vec<&str> = genbank_accession.split('.').collect();
117 if parts.len() != 2 {
118 return None;
119 }
120
121 let accession_base = parts[0];
122 let version = parts[1];
123
124 if !version.chars().all(|c| c.is_ascii_digit()) {
126 return None;
127 }
128
129 if !accession_base
131 .chars()
132 .all(|c| c.is_ascii_alphanumeric() || c == '_')
133 {
134 return None;
135 }
136
137 let suffix = match patch_type {
139 PatchType::Fix => "fix",
140 PatchType::Novel => "alt",
141 };
142
143 Some(format!(
145 "chr{chromosome}_{accession_base}v{version}_{suffix}"
146 ))
147}
148
149impl NcbiContigEntry {
150 #[must_use]
177 pub fn all_names_with_options(&self, generate_ucsc_names: bool) -> Vec<String> {
178 let mut names = vec![self.sequence_name.clone()];
179
180 if let Some(ref name) = self.genbank_accn {
182 if !name.is_empty() && name != "na" && !names.contains(name) {
183 names.push(name.clone());
184 }
185 }
186 if let Some(ref name) = self.refseq_accn {
187 if !name.is_empty() && name != "na" && !names.contains(name) {
188 names.push(name.clone());
189 }
190 }
191
192 let effective_ucsc_name = if let Some(ref name) = self.ucsc_name {
194 if !name.is_empty() && name != "na" {
195 Some(name.clone())
196 } else {
197 None
198 }
199 } else {
200 None
201 };
202
203 if let Some(ucsc_name) = effective_ucsc_name {
204 if !names.contains(&ucsc_name) {
205 names.push(ucsc_name);
206 }
207 } else if generate_ucsc_names {
208 if let (Some(ref genbank), Some(ref chromosome), Some(patch_type)) =
210 (&self.genbank_accn, &self.assigned_molecule, self.patch_type)
211 {
212 if !genbank.is_empty() && genbank != "na" && !chromosome.is_empty() {
213 if let Some(generated_name) =
214 generate_ucsc_patch_name(genbank, chromosome, patch_type)
215 {
216 if !names.contains(&generated_name) {
217 names.push(generated_name);
218 }
219 }
220 }
221 }
222 }
223
224 names
225 }
226
227 #[must_use]
232 pub fn to_contig(&self) -> Contig {
233 self.to_contig_with_options(true)
234 }
235
236 #[must_use]
244 pub fn to_contig_with_options(&self, generate_ucsc_names: bool) -> Contig {
245 let mut contig = Contig::new(&self.sequence_name, self.length);
246
247 let all = self.all_names_with_options(generate_ucsc_names);
249 if all.len() > 1 {
250 contig.aliases = all.into_iter().skip(1).collect();
251 }
252
253 if let Some(ref role_str) = self.role {
255 contig.sequence_role = SequenceRole::parse(role_str);
256 }
257
258 contig
259 }
260}
261
262#[allow(clippy::too_many_lines)]
269pub fn parse_ncbi_report_text(text: &str) -> Result<Vec<NcbiContigEntry>, ParseError> {
270 let mut entries = Vec::new();
271 let mut header_map: HashMap<String, usize> = HashMap::new();
273 let mut found_header = false;
274
275 for line in text.lines() {
276 if line.starts_with('#') {
278 let line_lower = line.to_lowercase();
281 if line_lower.contains("sequence-name") {
282 let header_line = line.trim_start_matches('#').trim();
283 for (idx, col) in header_line.split('\t').enumerate() {
284 header_map.insert(col.trim().to_lowercase(), idx);
286 }
287 found_header = true;
288 }
289 continue;
290 }
291
292 if line.trim().is_empty() {
293 continue;
294 }
295
296 if !found_header {
297 return Err(ParseError::InvalidFormat(
298 "NCBI assembly report header not found".to_string(),
299 ));
300 }
301
302 let fields: Vec<&str> = line.split('\t').collect();
303
304 let seq_name_idx = header_map
306 .get("sequence-name")
307 .ok_or_else(|| ParseError::InvalidFormat("Missing Sequence-Name column".to_string()))?;
308 let length_idx = header_map.get("sequence-length").ok_or_else(|| {
309 ParseError::InvalidFormat("Missing Sequence-Length column".to_string())
310 })?;
311
312 if fields.len() <= *seq_name_idx || fields.len() <= *length_idx {
313 continue; }
315
316 let sequence_name = fields[*seq_name_idx].trim().to_string();
317 let length: u64 = fields[*length_idx].trim().parse().map_err(|_| {
318 ParseError::InvalidFormat(format!(
319 "Invalid length for '{}': {}",
320 sequence_name, fields[*length_idx]
321 ))
322 })?;
323
324 let get_optional = |name: &str| -> Option<String> {
326 header_map
327 .get(name)
328 .and_then(|&idx| {
329 fields.get(idx).map(|s| {
330 let s = s.trim();
331 if s.is_empty() || s == "na" {
332 None
333 } else {
334 Some(s.to_string())
335 }
336 })
337 })
338 .flatten()
339 };
340
341 let get_raw_optional = |name: &str| -> Option<String> {
343 header_map
344 .get(name)
345 .and_then(|&idx| {
346 fields.get(idx).map(|s| {
347 let s = s.trim();
348 if s.is_empty() {
349 None
350 } else {
351 Some(s.to_string())
352 }
353 })
354 })
355 .flatten()
356 };
357
358 let role = get_raw_optional("sequence-role");
360 let patch_type = role.as_ref().and_then(|r| {
361 let r_lower = r.to_lowercase();
362 if r_lower == "fix-patch" {
363 Some(PatchType::Fix)
364 } else if r_lower == "novel-patch" {
365 Some(PatchType::Novel)
366 } else {
367 None
368 }
369 });
370
371 let assigned_molecule = get_optional("assigned-molecule");
373
374 if check_contig_limit(entries.len()).is_some() {
376 return Err(ParseError::TooManyContigs(entries.len()));
377 }
378
379 entries.push(NcbiContigEntry {
380 sequence_name,
381 length,
382 genbank_accn: get_optional("genbank-accn"),
384 refseq_accn: get_optional("refseq-accn"),
385 ucsc_name: get_raw_optional("ucsc-style-name"), role,
387 assigned_molecule,
388 patch_type,
389 });
390 }
391
392 if entries.is_empty() {
393 return Err(ParseError::InvalidFormat(
394 "No contigs found in NCBI assembly report".to_string(),
395 ));
396 }
397
398 Ok(entries)
399}
400
401#[cfg(test)]
402mod tests {
403 use super::*;
404
405 #[test]
406 fn test_parse_ncbi_report() {
407 let report = r"# Assembly name: GRCh38.p14
409# Organism name: Homo sapiens
410# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
4111 assembled-molecule 1 Chromosome CM000663.2 = NC_000001.11 Primary Assembly 248956422 chr1
4122 assembled-molecule 2 Chromosome CM000664.2 = NC_000002.12 Primary Assembly 242193529 chr2
413MT assembled-molecule MT Mitochondrion J01415.2 = NC_012920.1 non-nuclear 16569 chrM
414";
415
416 let entries = parse_ncbi_report_text(report).unwrap();
417 assert_eq!(entries.len(), 3);
418
419 let chr1 = &entries[0];
421 assert_eq!(chr1.sequence_name, "1");
422 assert_eq!(chr1.length, 248_956_422);
423 assert_eq!(chr1.genbank_accn, Some("CM000663.2".to_string()));
424 assert_eq!(chr1.refseq_accn, Some("NC_000001.11".to_string()));
425 assert_eq!(chr1.ucsc_name, Some("chr1".to_string()));
426 assert_eq!(chr1.assigned_molecule, Some("1".to_string()));
427 assert!(chr1.patch_type.is_none()); let names = chr1.all_names_with_options(true);
431 assert_eq!(names.len(), 4); assert!(names.contains(&"1".to_string()));
433 assert!(names.contains(&"chr1".to_string()));
434 assert!(names.contains(&"NC_000001.11".to_string()));
435
436 let mt = &entries[2];
438 assert_eq!(mt.sequence_name, "MT");
439 assert_eq!(mt.length, 16569);
440 assert_eq!(mt.ucsc_name, Some("chrM".to_string()));
441 }
442
443 #[test]
444 fn test_ncbi_entry_to_contig() {
445 let entry = NcbiContigEntry {
446 sequence_name: "1".to_string(),
447 length: 248_956_422,
448 genbank_accn: Some("CM000663.2".to_string()),
449 refseq_accn: Some("NC_000001.11".to_string()),
450 ucsc_name: Some("chr1".to_string()),
451 role: Some("assembled-molecule".to_string()),
452 assigned_molecule: Some("1".to_string()),
453 patch_type: None,
454 };
455
456 let contig = entry.to_contig();
457 assert_eq!(contig.name, "1");
458 assert_eq!(contig.length, 248_956_422);
459 assert_eq!(contig.aliases.len(), 3); assert!(contig.aliases.contains(&"chr1".to_string()));
461 assert!(contig.aliases.contains(&"NC_000001.11".to_string()));
462 }
463
464 #[test]
465 fn test_parse_ncbi_report_no_header() {
466 let report = "1\tassembled-molecule\t1\t248956422\n";
467 let result = parse_ncbi_report_text(report);
468 assert!(result.is_err());
469 }
470
471 #[test]
476 fn test_generate_ucsc_patch_name_fix() {
477 let name = generate_ucsc_patch_name("KN196472.1", "1", PatchType::Fix);
479 assert_eq!(name, Some("chr1_KN196472v1_fix".to_string()));
480
481 let name = generate_ucsc_patch_name("KZ208915.1", "8", PatchType::Fix);
483 assert_eq!(name, Some("chr8_KZ208915v1_fix".to_string()));
484
485 let name = generate_ucsc_patch_name("KN196487.1", "Y", PatchType::Fix);
487 assert_eq!(name, Some("chrY_KN196487v1_fix".to_string()));
488 }
489
490 #[test]
491 fn test_generate_ucsc_patch_name_novel() {
492 let name = generate_ucsc_patch_name("KQ458382.1", "1", PatchType::Novel);
494 assert_eq!(name, Some("chr1_KQ458382v1_alt".to_string()));
495
496 let name = generate_ucsc_patch_name("KV766199.1", "X", PatchType::Novel);
498 assert_eq!(name, Some("chrX_KV766199v1_alt".to_string()));
499 }
500
501 #[test]
502 fn test_generate_ucsc_patch_name_version_2() {
503 let name = generate_ucsc_patch_name("GL000256.2", "6", PatchType::Novel);
505 assert_eq!(name, Some("chr6_GL000256v2_alt".to_string()));
506 }
507
508 #[test]
509 fn test_generate_ucsc_patch_name_invalid_format() {
510 let name = generate_ucsc_patch_name("KN196472", "1", PatchType::Fix);
512 assert_eq!(name, None);
513
514 let name = generate_ucsc_patch_name("KN196472.1.2", "1", PatchType::Fix);
516 assert_eq!(name, None);
517
518 let name = generate_ucsc_patch_name("KN196472.a", "1", PatchType::Fix);
520 assert_eq!(name, None);
521
522 let name = generate_ucsc_patch_name("", "1", PatchType::Fix);
524 assert_eq!(name, None);
525 }
526
527 #[test]
528 fn test_parse_ncbi_report_with_patches() {
529 let report = r"# Assembly name: GRCh38.p12
531# Organism name: Homo sapiens
532# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
5331 assembled-molecule 1 Chromosome CM000663.2 = NC_000001.11 Primary Assembly 248956422 chr1
534HG986_PATCH fix-patch 1 Chromosome KN196472.1 = NW_009646194.1 PATCHES 186494 na
535HSCHR1_3_CTG3 novel-patch 1 Chromosome KQ458382.1 = NW_014040925.1 PATCHES 141019 na
536";
537
538 let entries = parse_ncbi_report_text(report).unwrap();
539 assert_eq!(entries.len(), 3);
540
541 let fix_patch = &entries[1];
543 assert_eq!(fix_patch.sequence_name, "HG986_PATCH");
544 assert_eq!(fix_patch.patch_type, Some(PatchType::Fix));
545 assert_eq!(fix_patch.assigned_molecule, Some("1".to_string()));
546 assert_eq!(fix_patch.genbank_accn, Some("KN196472.1".to_string()));
547 assert_eq!(fix_patch.ucsc_name, Some("na".to_string())); let novel_patch = &entries[2];
551 assert_eq!(novel_patch.sequence_name, "HSCHR1_3_CTG3");
552 assert_eq!(novel_patch.patch_type, Some(PatchType::Novel));
553 assert_eq!(novel_patch.assigned_molecule, Some("1".to_string()));
554 }
555
556 #[test]
557 fn test_all_names_with_ucsc_generation_enabled() {
558 let entry = NcbiContigEntry {
560 sequence_name: "HG986_PATCH".to_string(),
561 length: 186_494,
562 genbank_accn: Some("KN196472.1".to_string()),
563 refseq_accn: Some("NW_009646194.1".to_string()),
564 ucsc_name: Some("na".to_string()), role: Some("fix-patch".to_string()),
566 assigned_molecule: Some("1".to_string()),
567 patch_type: Some(PatchType::Fix),
568 };
569
570 let names = entry.all_names_with_options(true);
572 assert!(
573 names.contains(&"chr1_KN196472v1_fix".to_string()),
574 "Generated UCSC name should be present: {names:?}"
575 );
576 assert!(names.contains(&"HG986_PATCH".to_string()));
577 assert!(names.contains(&"KN196472.1".to_string()));
578 assert!(names.contains(&"NW_009646194.1".to_string()));
579 }
580
581 #[test]
582 fn test_all_names_with_ucsc_generation_disabled() {
583 let entry = NcbiContigEntry {
585 sequence_name: "HG986_PATCH".to_string(),
586 length: 186_494,
587 genbank_accn: Some("KN196472.1".to_string()),
588 refseq_accn: Some("NW_009646194.1".to_string()),
589 ucsc_name: Some("na".to_string()),
590 role: Some("fix-patch".to_string()),
591 assigned_molecule: Some("1".to_string()),
592 patch_type: Some(PatchType::Fix),
593 };
594
595 let names = entry.all_names_with_options(false);
597 assert!(
598 !names.contains(&"chr1_KN196472v1_fix".to_string()),
599 "Generated UCSC name should NOT be present: {names:?}"
600 );
601 assert!(names.contains(&"HG986_PATCH".to_string()));
602 assert!(names.contains(&"KN196472.1".to_string()));
603 }
604
605 #[test]
606 fn test_all_names_with_existing_ucsc_name() {
607 let entry = NcbiContigEntry {
609 sequence_name: "1".to_string(),
610 length: 248_956_422,
611 genbank_accn: Some("CM000663.2".to_string()),
612 refseq_accn: Some("NC_000001.11".to_string()),
613 ucsc_name: Some("chr1".to_string()), role: Some("assembled-molecule".to_string()),
615 assigned_molecule: Some("1".to_string()),
616 patch_type: None,
617 };
618
619 let names_enabled = entry.all_names_with_options(true);
621 let names_disabled = entry.all_names_with_options(false);
622
623 assert!(names_enabled.contains(&"chr1".to_string()));
624 assert!(names_disabled.contains(&"chr1".to_string()));
625 assert_eq!(names_enabled.len(), names_disabled.len());
626 }
627
628 #[test]
629 fn test_to_contig_with_ucsc_generation() {
630 let entry = NcbiContigEntry {
632 sequence_name: "HG986_PATCH".to_string(),
633 length: 186_494,
634 genbank_accn: Some("KN196472.1".to_string()),
635 refseq_accn: Some("NW_009646194.1".to_string()),
636 ucsc_name: Some("na".to_string()),
637 role: Some("fix-patch".to_string()),
638 assigned_molecule: Some("1".to_string()),
639 patch_type: Some(PatchType::Fix),
640 };
641
642 let contig = entry.to_contig_with_options(true);
644 assert!(
645 contig.aliases.contains(&"chr1_KN196472v1_fix".to_string()),
646 "Contig aliases should include generated UCSC name: {:?}",
647 contig.aliases
648 );
649
650 let contig = entry.to_contig_with_options(false);
652 assert!(
653 !contig.aliases.contains(&"chr1_KN196472v1_fix".to_string()),
654 "Contig aliases should NOT include generated UCSC name: {:?}",
655 contig.aliases
656 );
657 }
658
659 #[test]
660 fn test_novel_patch_ucsc_generation() {
661 let entry = NcbiContigEntry {
663 sequence_name: "HSCHR1_3_CTG3".to_string(),
664 length: 141_019,
665 genbank_accn: Some("KQ458382.1".to_string()),
666 refseq_accn: Some("NW_014040925.1".to_string()),
667 ucsc_name: Some("na".to_string()),
668 role: Some("novel-patch".to_string()),
669 assigned_molecule: Some("1".to_string()),
670 patch_type: Some(PatchType::Novel),
671 };
672
673 let names = entry.all_names_with_options(true);
674 assert!(
675 names.contains(&"chr1_KQ458382v1_alt".to_string()),
676 "Novel patch should have _alt suffix: {names:?}"
677 );
678 }
679
680 #[test]
681 fn test_ucsc_generation_for_different_chromosomes() {
682 let test_cases = vec![
684 ("1", "KN196472.1", PatchType::Fix, "chr1_KN196472v1_fix"),
685 ("X", "KV766199.1", PatchType::Novel, "chrX_KV766199v1_alt"),
686 ("Y", "KN196487.1", PatchType::Fix, "chrY_KN196487v1_fix"),
687 ("22", "KZ208920.1", PatchType::Fix, "chr22_KZ208920v1_fix"),
688 ];
689
690 for (chrom, accession, patch_type, expected) in test_cases {
691 let entry = NcbiContigEntry {
692 sequence_name: "TEST_PATCH".to_string(),
693 length: 1000,
694 genbank_accn: Some(accession.to_string()),
695 refseq_accn: None,
696 ucsc_name: Some("na".to_string()),
697 role: None,
698 assigned_molecule: Some(chrom.to_string()),
699 patch_type: Some(patch_type),
700 };
701
702 let names = entry.all_names_with_options(true);
703 assert!(
704 names.contains(&expected.to_string()),
705 "Expected {expected} for chromosome {chrom}, accession {accession}: {names:?}"
706 );
707 }
708 }
709}