use std::collections::HashMap;
use crate::core::contig::{Contig, SequenceRole};
use crate::parsing::sam::ParseError;
use crate::utils::validation::check_contig_limit;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PatchType {
Fix,
Novel,
}
#[derive(Debug, Clone)]
pub struct NcbiContigEntry {
pub sequence_name: String,
pub length: u64,
pub genbank_accn: Option<String>,
pub refseq_accn: Option<String>,
pub ucsc_name: Option<String>,
pub role: Option<String>,
pub assigned_molecule: Option<String>,
pub patch_type: Option<PatchType>,
}
#[must_use]
pub fn generate_ucsc_patch_name(
genbank_accession: &str,
chromosome: &str,
patch_type: PatchType,
) -> Option<String> {
let parts: Vec<&str> = genbank_accession.split('.').collect();
if parts.len() != 2 {
return None;
}
let accession_base = parts[0];
let version = parts[1];
if !version.chars().all(|c| c.is_ascii_digit()) {
return None;
}
if !accession_base
.chars()
.all(|c| c.is_ascii_alphanumeric() || c == '_')
{
return None;
}
let suffix = match patch_type {
PatchType::Fix => "fix",
PatchType::Novel => "alt",
};
Some(format!(
"chr{chromosome}_{accession_base}v{version}_{suffix}"
))
}
impl NcbiContigEntry {
#[must_use]
pub fn all_names_with_options(&self, generate_ucsc_names: bool) -> Vec<String> {
let mut names = vec![self.sequence_name.clone()];
if let Some(ref name) = self.genbank_accn {
if !name.is_empty() && name != "na" && !names.contains(name) {
names.push(name.clone());
}
}
if let Some(ref name) = self.refseq_accn {
if !name.is_empty() && name != "na" && !names.contains(name) {
names.push(name.clone());
}
}
let effective_ucsc_name = if let Some(ref name) = self.ucsc_name {
if !name.is_empty() && name != "na" {
Some(name.clone())
} else {
None
}
} else {
None
};
if let Some(ucsc_name) = effective_ucsc_name {
if !names.contains(&ucsc_name) {
names.push(ucsc_name);
}
} else if generate_ucsc_names {
if let (Some(ref genbank), Some(ref chromosome), Some(patch_type)) =
(&self.genbank_accn, &self.assigned_molecule, self.patch_type)
{
if !genbank.is_empty() && genbank != "na" && !chromosome.is_empty() {
if let Some(generated_name) =
generate_ucsc_patch_name(genbank, chromosome, patch_type)
{
if !names.contains(&generated_name) {
names.push(generated_name);
}
}
}
}
}
names
}
#[must_use]
pub fn to_contig(&self) -> Contig {
self.to_contig_with_options(true)
}
#[must_use]
pub fn to_contig_with_options(&self, generate_ucsc_names: bool) -> Contig {
let mut contig = Contig::new(&self.sequence_name, self.length);
let all = self.all_names_with_options(generate_ucsc_names);
if all.len() > 1 {
contig.aliases = all.into_iter().skip(1).collect();
}
if let Some(ref role_str) = self.role {
contig.sequence_role = SequenceRole::parse(role_str);
}
contig
}
}
#[allow(clippy::too_many_lines)]
pub fn parse_ncbi_report_text(text: &str) -> Result<Vec<NcbiContigEntry>, ParseError> {
let mut entries = Vec::new();
let mut header_map: HashMap<String, usize> = HashMap::new();
let mut found_header = false;
for line in text.lines() {
if line.starts_with('#') {
let line_lower = line.to_lowercase();
if line_lower.contains("sequence-name") {
let header_line = line.trim_start_matches('#').trim();
for (idx, col) in header_line.split('\t').enumerate() {
header_map.insert(col.trim().to_lowercase(), idx);
}
found_header = true;
}
continue;
}
if line.trim().is_empty() {
continue;
}
if !found_header {
return Err(ParseError::InvalidFormat(
"NCBI assembly report header not found".to_string(),
));
}
let fields: Vec<&str> = line.split('\t').collect();
let seq_name_idx = header_map
.get("sequence-name")
.ok_or_else(|| ParseError::InvalidFormat("Missing Sequence-Name column".to_string()))?;
let length_idx = header_map.get("sequence-length").ok_or_else(|| {
ParseError::InvalidFormat("Missing Sequence-Length column".to_string())
})?;
if fields.len() <= *seq_name_idx || fields.len() <= *length_idx {
continue; }
let sequence_name = fields[*seq_name_idx].trim().to_string();
let length: u64 = fields[*length_idx].trim().parse().map_err(|_| {
ParseError::InvalidFormat(format!(
"Invalid length for '{}': {}",
sequence_name, fields[*length_idx]
))
})?;
let get_optional = |name: &str| -> Option<String> {
header_map
.get(name)
.and_then(|&idx| {
fields.get(idx).map(|s| {
let s = s.trim();
if s.is_empty() || s == "na" {
None
} else {
Some(s.to_string())
}
})
})
.flatten()
};
let get_raw_optional = |name: &str| -> Option<String> {
header_map
.get(name)
.and_then(|&idx| {
fields.get(idx).map(|s| {
let s = s.trim();
if s.is_empty() {
None
} else {
Some(s.to_string())
}
})
})
.flatten()
};
let role = get_raw_optional("sequence-role");
let patch_type = role.as_ref().and_then(|r| {
let r_lower = r.to_lowercase();
if r_lower == "fix-patch" {
Some(PatchType::Fix)
} else if r_lower == "novel-patch" {
Some(PatchType::Novel)
} else {
None
}
});
let assigned_molecule = get_optional("assigned-molecule");
if check_contig_limit(entries.len()).is_some() {
return Err(ParseError::TooManyContigs(entries.len()));
}
entries.push(NcbiContigEntry {
sequence_name,
length,
genbank_accn: get_optional("genbank-accn"),
refseq_accn: get_optional("refseq-accn"),
ucsc_name: get_raw_optional("ucsc-style-name"), role,
assigned_molecule,
patch_type,
});
}
if entries.is_empty() {
return Err(ParseError::InvalidFormat(
"No contigs found in NCBI assembly report".to_string(),
));
}
Ok(entries)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_ncbi_report() {
let report = r"# Assembly name: GRCh38.p14
# Organism name: Homo sapiens
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
1 assembled-molecule 1 Chromosome CM000663.2 = NC_000001.11 Primary Assembly 248956422 chr1
2 assembled-molecule 2 Chromosome CM000664.2 = NC_000002.12 Primary Assembly 242193529 chr2
MT assembled-molecule MT Mitochondrion J01415.2 = NC_012920.1 non-nuclear 16569 chrM
";
let entries = parse_ncbi_report_text(report).unwrap();
assert_eq!(entries.len(), 3);
let chr1 = &entries[0];
assert_eq!(chr1.sequence_name, "1");
assert_eq!(chr1.length, 248_956_422);
assert_eq!(chr1.genbank_accn, Some("CM000663.2".to_string()));
assert_eq!(chr1.refseq_accn, Some("NC_000001.11".to_string()));
assert_eq!(chr1.ucsc_name, Some("chr1".to_string()));
assert_eq!(chr1.assigned_molecule, Some("1".to_string()));
assert!(chr1.patch_type.is_none());
let names = chr1.all_names_with_options(true);
assert_eq!(names.len(), 4); assert!(names.contains(&"1".to_string()));
assert!(names.contains(&"chr1".to_string()));
assert!(names.contains(&"NC_000001.11".to_string()));
let mt = &entries[2];
assert_eq!(mt.sequence_name, "MT");
assert_eq!(mt.length, 16569);
assert_eq!(mt.ucsc_name, Some("chrM".to_string()));
}
#[test]
fn test_ncbi_entry_to_contig() {
let entry = NcbiContigEntry {
sequence_name: "1".to_string(),
length: 248_956_422,
genbank_accn: Some("CM000663.2".to_string()),
refseq_accn: Some("NC_000001.11".to_string()),
ucsc_name: Some("chr1".to_string()),
role: Some("assembled-molecule".to_string()),
assigned_molecule: Some("1".to_string()),
patch_type: None,
};
let contig = entry.to_contig();
assert_eq!(contig.name, "1");
assert_eq!(contig.length, 248_956_422);
assert_eq!(contig.aliases.len(), 3); assert!(contig.aliases.contains(&"chr1".to_string()));
assert!(contig.aliases.contains(&"NC_000001.11".to_string()));
}
#[test]
fn test_parse_ncbi_report_no_header() {
let report = "1\tassembled-molecule\t1\t248956422\n";
let result = parse_ncbi_report_text(report);
assert!(result.is_err());
}
#[test]
fn test_generate_ucsc_patch_name_fix() {
let name = generate_ucsc_patch_name("KN196472.1", "1", PatchType::Fix);
assert_eq!(name, Some("chr1_KN196472v1_fix".to_string()));
let name = generate_ucsc_patch_name("KZ208915.1", "8", PatchType::Fix);
assert_eq!(name, Some("chr8_KZ208915v1_fix".to_string()));
let name = generate_ucsc_patch_name("KN196487.1", "Y", PatchType::Fix);
assert_eq!(name, Some("chrY_KN196487v1_fix".to_string()));
}
#[test]
fn test_generate_ucsc_patch_name_novel() {
let name = generate_ucsc_patch_name("KQ458382.1", "1", PatchType::Novel);
assert_eq!(name, Some("chr1_KQ458382v1_alt".to_string()));
let name = generate_ucsc_patch_name("KV766199.1", "X", PatchType::Novel);
assert_eq!(name, Some("chrX_KV766199v1_alt".to_string()));
}
#[test]
fn test_generate_ucsc_patch_name_version_2() {
let name = generate_ucsc_patch_name("GL000256.2", "6", PatchType::Novel);
assert_eq!(name, Some("chr6_GL000256v2_alt".to_string()));
}
#[test]
fn test_generate_ucsc_patch_name_invalid_format() {
let name = generate_ucsc_patch_name("KN196472", "1", PatchType::Fix);
assert_eq!(name, None);
let name = generate_ucsc_patch_name("KN196472.1.2", "1", PatchType::Fix);
assert_eq!(name, None);
let name = generate_ucsc_patch_name("KN196472.a", "1", PatchType::Fix);
assert_eq!(name, None);
let name = generate_ucsc_patch_name("", "1", PatchType::Fix);
assert_eq!(name, None);
}
#[test]
fn test_parse_ncbi_report_with_patches() {
let report = r"# Assembly name: GRCh38.p12
# Organism name: Homo sapiens
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
1 assembled-molecule 1 Chromosome CM000663.2 = NC_000001.11 Primary Assembly 248956422 chr1
HG986_PATCH fix-patch 1 Chromosome KN196472.1 = NW_009646194.1 PATCHES 186494 na
HSCHR1_3_CTG3 novel-patch 1 Chromosome KQ458382.1 = NW_014040925.1 PATCHES 141019 na
";
let entries = parse_ncbi_report_text(report).unwrap();
assert_eq!(entries.len(), 3);
let fix_patch = &entries[1];
assert_eq!(fix_patch.sequence_name, "HG986_PATCH");
assert_eq!(fix_patch.patch_type, Some(PatchType::Fix));
assert_eq!(fix_patch.assigned_molecule, Some("1".to_string()));
assert_eq!(fix_patch.genbank_accn, Some("KN196472.1".to_string()));
assert_eq!(fix_patch.ucsc_name, Some("na".to_string()));
let novel_patch = &entries[2];
assert_eq!(novel_patch.sequence_name, "HSCHR1_3_CTG3");
assert_eq!(novel_patch.patch_type, Some(PatchType::Novel));
assert_eq!(novel_patch.assigned_molecule, Some("1".to_string()));
}
#[test]
fn test_all_names_with_ucsc_generation_enabled() {
let entry = NcbiContigEntry {
sequence_name: "HG986_PATCH".to_string(),
length: 186_494,
genbank_accn: Some("KN196472.1".to_string()),
refseq_accn: Some("NW_009646194.1".to_string()),
ucsc_name: Some("na".to_string()), role: Some("fix-patch".to_string()),
assigned_molecule: Some("1".to_string()),
patch_type: Some(PatchType::Fix),
};
let names = entry.all_names_with_options(true);
assert!(
names.contains(&"chr1_KN196472v1_fix".to_string()),
"Generated UCSC name should be present: {names:?}"
);
assert!(names.contains(&"HG986_PATCH".to_string()));
assert!(names.contains(&"KN196472.1".to_string()));
assert!(names.contains(&"NW_009646194.1".to_string()));
}
#[test]
fn test_all_names_with_ucsc_generation_disabled() {
let entry = NcbiContigEntry {
sequence_name: "HG986_PATCH".to_string(),
length: 186_494,
genbank_accn: Some("KN196472.1".to_string()),
refseq_accn: Some("NW_009646194.1".to_string()),
ucsc_name: Some("na".to_string()),
role: Some("fix-patch".to_string()),
assigned_molecule: Some("1".to_string()),
patch_type: Some(PatchType::Fix),
};
let names = entry.all_names_with_options(false);
assert!(
!names.contains(&"chr1_KN196472v1_fix".to_string()),
"Generated UCSC name should NOT be present: {names:?}"
);
assert!(names.contains(&"HG986_PATCH".to_string()));
assert!(names.contains(&"KN196472.1".to_string()));
}
#[test]
fn test_all_names_with_existing_ucsc_name() {
let entry = NcbiContigEntry {
sequence_name: "1".to_string(),
length: 248_956_422,
genbank_accn: Some("CM000663.2".to_string()),
refseq_accn: Some("NC_000001.11".to_string()),
ucsc_name: Some("chr1".to_string()), role: Some("assembled-molecule".to_string()),
assigned_molecule: Some("1".to_string()),
patch_type: None,
};
let names_enabled = entry.all_names_with_options(true);
let names_disabled = entry.all_names_with_options(false);
assert!(names_enabled.contains(&"chr1".to_string()));
assert!(names_disabled.contains(&"chr1".to_string()));
assert_eq!(names_enabled.len(), names_disabled.len());
}
#[test]
fn test_to_contig_with_ucsc_generation() {
let entry = NcbiContigEntry {
sequence_name: "HG986_PATCH".to_string(),
length: 186_494,
genbank_accn: Some("KN196472.1".to_string()),
refseq_accn: Some("NW_009646194.1".to_string()),
ucsc_name: Some("na".to_string()),
role: Some("fix-patch".to_string()),
assigned_molecule: Some("1".to_string()),
patch_type: Some(PatchType::Fix),
};
let contig = entry.to_contig_with_options(true);
assert!(
contig.aliases.contains(&"chr1_KN196472v1_fix".to_string()),
"Contig aliases should include generated UCSC name: {:?}",
contig.aliases
);
let contig = entry.to_contig_with_options(false);
assert!(
!contig.aliases.contains(&"chr1_KN196472v1_fix".to_string()),
"Contig aliases should NOT include generated UCSC name: {:?}",
contig.aliases
);
}
#[test]
fn test_novel_patch_ucsc_generation() {
let entry = NcbiContigEntry {
sequence_name: "HSCHR1_3_CTG3".to_string(),
length: 141_019,
genbank_accn: Some("KQ458382.1".to_string()),
refseq_accn: Some("NW_014040925.1".to_string()),
ucsc_name: Some("na".to_string()),
role: Some("novel-patch".to_string()),
assigned_molecule: Some("1".to_string()),
patch_type: Some(PatchType::Novel),
};
let names = entry.all_names_with_options(true);
assert!(
names.contains(&"chr1_KQ458382v1_alt".to_string()),
"Novel patch should have _alt suffix: {names:?}"
);
}
#[test]
fn test_ucsc_generation_for_different_chromosomes() {
let test_cases = vec![
("1", "KN196472.1", PatchType::Fix, "chr1_KN196472v1_fix"),
("X", "KV766199.1", PatchType::Novel, "chrX_KV766199v1_alt"),
("Y", "KN196487.1", PatchType::Fix, "chrY_KN196487v1_fix"),
("22", "KZ208920.1", PatchType::Fix, "chr22_KZ208920v1_fix"),
];
for (chrom, accession, patch_type, expected) in test_cases {
let entry = NcbiContigEntry {
sequence_name: "TEST_PATCH".to_string(),
length: 1000,
genbank_accn: Some(accession.to_string()),
refseq_accn: None,
ucsc_name: Some("na".to_string()),
role: None,
assigned_molecule: Some(chrom.to_string()),
patch_type: Some(patch_type),
};
let names = entry.all_names_with_options(true);
assert!(
names.contains(&expected.to_string()),
"Expected {expected} for chromosome {chrom}, accession {accession}: {names:?}"
);
}
}
}