Skip to main content

ref_solver/core/
contig.rs

1use serde::{Deserialize, Serialize};
2
3use crate::core::types::NamingConvention;
4
/// Sequence role from NCBI assembly report
///
/// With serde the variants are written and read in kebab-case (for example
/// `AssembledMolecule` is `"assembled-molecule"`), and [`SequenceRole::Unknown`]
/// is the `Default` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum SequenceRole {
    /// Primary chromosome (1-22, X, Y, MT)
    AssembledMolecule,
    /// Alternate locus scaffold
    AltScaffold,
    /// Fix patch (error correction)
    FixPatch,
    /// Novel patch (new sequence)
    NovelPatch,
    /// Unlocalized scaffold (known chromosome, unknown location)
    UnlocalizedScaffold,
    /// Unplaced scaffold (unknown chromosome)
    UnplacedScaffold,
    /// Role not specified or unknown; this is the value produced by `Default`
    #[default]
    Unknown,
}
25
26impl SequenceRole {
27    /// Parse a sequence role from string representation (e.g. from NCBI assembly report)
28    #[must_use]
29    pub fn parse(s: &str) -> Self {
30        match s.to_lowercase().as_str() {
31            "assembled-molecule" => SequenceRole::AssembledMolecule,
32            "alt-scaffold" => SequenceRole::AltScaffold,
33            "fix-patch" => SequenceRole::FixPatch,
34            "novel-patch" => SequenceRole::NovelPatch,
35            "unlocalized-scaffold" => SequenceRole::UnlocalizedScaffold,
36            "unplaced-scaffold" => SequenceRole::UnplacedScaffold,
37            _ => SequenceRole::Unknown,
38        }
39    }
40}
41
/// A single contig/sequence in a reference genome
///
/// Only `name` and `length` are mandatory. Every other field carries
/// `#[serde(default)]`, so it may be absent on deserialization, and is skipped
/// on serialization when it holds its empty value (`None`, an empty `Vec`, or
/// `SequenceRole::Unknown`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Contig {
    /// Sequence name (SN tag in SAM)
    pub name: String,

    /// Sequence length (LN tag in SAM)
    pub length: u64,

    /// MD5 checksum of the sequence (M5 tag in SAM)
    /// Lowercase hex, 32 characters
    // Omitted from serialized output when `None`; defaults to `None` when missing.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub md5: Option<String>,

    /// Assembly identifier (AS tag in SAM)
    // Omitted from serialized output when `None`; defaults to `None` when missing.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub assembly: Option<String>,

    /// URI where sequence can be retrieved (UR tag in SAM)
    // Omitted from serialized output when `None`; defaults to `None` when missing.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub uri: Option<String>,

    /// Species (SP tag in SAM)
    // Omitted from serialized output when `None`; defaults to `None` when missing.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub species: Option<String>,

    /// Known alternative names for this contig
    // Omitted from serialized output when empty; defaults to an empty list when missing.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub aliases: Vec<String>,

    /// Sequence role from NCBI assembly report
    // Omitted from serialized output when `Unknown` (see `is_unknown_role`);
    // defaults to `Unknown` when missing.
    #[serde(default, skip_serializing_if = "is_unknown_role")]
    pub sequence_role: SequenceRole,
}
76
77#[allow(clippy::trivially_copy_pass_by_ref)] // Required signature for serde skip_serializing_if
78fn is_unknown_role(role: &SequenceRole) -> bool {
79    matches!(role, SequenceRole::Unknown)
80}
81
82impl Contig {
83    pub fn new(name: impl Into<String>, length: u64) -> Self {
84        Self {
85            name: name.into(),
86            length,
87            md5: None,
88            assembly: None,
89            uri: None,
90            species: None,
91            aliases: Vec::new(),
92            sequence_role: SequenceRole::Unknown,
93        }
94    }
95
96    #[cfg(test)]
97    #[must_use]
98    pub fn with_md5(mut self, md5: impl Into<String>) -> Self {
99        self.md5 = Some(md5.into());
100        self
101    }
102
103    #[cfg(test)]
104    #[must_use]
105    pub fn with_aliases(mut self, aliases: Vec<String>) -> Self {
106        self.aliases = aliases;
107        self
108    }
109
110    /// Check if this contig is a primary chromosome (1-22, X, Y)
111    /// Matches both UCSC (chr1) and NCBI (1) naming conventions exactly
112    #[must_use]
113    pub fn is_primary_chromosome(&self) -> bool {
114        // NCBI style: 1-22, X, Y
115        // UCSC style: chr1-chr22, chrX, chrY
116        matches!(
117            self.name.as_str(),
118            "1" | "2"
119                | "3"
120                | "4"
121                | "5"
122                | "6"
123                | "7"
124                | "8"
125                | "9"
126                | "10"
127                | "11"
128                | "12"
129                | "13"
130                | "14"
131                | "15"
132                | "16"
133                | "17"
134                | "18"
135                | "19"
136                | "20"
137                | "21"
138                | "22"
139                | "X"
140                | "Y"
141                | "chr1"
142                | "chr2"
143                | "chr3"
144                | "chr4"
145                | "chr5"
146                | "chr6"
147                | "chr7"
148                | "chr8"
149                | "chr9"
150                | "chr10"
151                | "chr11"
152                | "chr12"
153                | "chr13"
154                | "chr14"
155                | "chr15"
156                | "chr16"
157                | "chr17"
158                | "chr18"
159                | "chr19"
160                | "chr20"
161                | "chr21"
162                | "chr22"
163                | "chrX"
164                | "chrY"
165        )
166    }
167
168    /// Check if this is a mitochondrial contig
169    /// Matches common mitochondrial names from various reference builds
170    #[must_use]
171    pub fn is_mitochondrial(&self) -> bool {
172        let name_lower = self.name.to_lowercase();
173        matches!(
174            name_lower.as_str(),
175            "mt" | "m" | "chrm" | "chrmt" | "mito" | "mitochondrion" | "rcrs" | "nc_012920.1"
176        ) || name_lower.contains("mitochon")
177    }
178
179    /// Check if this is an ALT contig (`GRCh38`)
180    #[must_use]
181    pub fn is_alt(&self) -> bool {
182        self.name.ends_with("_alt") || self.name.contains("_alt_")
183    }
184
185    /// Check if this is a decoy contig
186    #[must_use]
187    pub fn is_decoy(&self) -> bool {
188        self.name.contains("decoy")
189            || self.name == "hs37d5"
190            || self.name.starts_with("chrUn_")
191            || self.name.contains("_random")
192    }
193}
194
195// NOTE: normalize_contig_name() was removed.
196// Name equivalence is now defined ONLY through explicit aliases (AN tag in SAM/dict,
197// or NCBI assembly report columns). Matching uses exact names.
198
199/// Detect the naming convention used by a set of contigs
200#[must_use]
201pub fn detect_naming_convention(contigs: &[Contig]) -> NamingConvention {
202    let mut has_chr_prefix = false;
203    let mut has_no_prefix = false;
204
205    for contig in contigs {
206        if contig.is_primary_chromosome() {
207            if contig.name.starts_with("chr") {
208                has_chr_prefix = true;
209            } else {
210                has_no_prefix = true;
211            }
212        }
213    }
214
215    match (has_chr_prefix, has_no_prefix) {
216        (true, false) => NamingConvention::Ucsc,
217        (false, true) => NamingConvention::Ncbi,
218        _ => NamingConvention::Mixed,
219    }
220}
221
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `predicate` yields `expected` for a contig built from each name,
    /// reporting the offending name on failure.
    fn check_all(names: &[&str], predicate: fn(&Contig) -> bool, expected: bool) {
        for &name in names {
            assert_eq!(
                predicate(&Contig::new(name, 100)),
                expected,
                "unexpected result for contig name {name:?}"
            );
        }
    }

    /// Build a list of contigs (length is irrelevant for naming checks).
    fn contigs(names: &[&str]) -> Vec<Contig> {
        names.iter().map(|&name| Contig::new(name, 100)).collect()
    }

    #[test]
    fn test_is_primary_chromosome() {
        check_all(
            &["chr1", "1", "chr22", "22", "chrX", "Y"],
            Contig::is_primary_chromosome,
            true,
        );
        // Mitochondrial, suffixed, out-of-range and degenerate names are not primary.
        check_all(
            &["chrM", "chr1_random", "chr23", "0", "chr", ""],
            Contig::is_primary_chromosome,
            false,
        );
    }

    #[test]
    fn test_is_mitochondrial() {
        check_all(
            &[
                // Standard names
                "chrM",
                "MT",
                "chrMT",
                "M",
                // Extended names from older references
                "mito",
                "Mitochondrion",
                "rCRS",
                "NC_012920.1",
                // Substring match
                "mitochondrial_genome",
            ],
            Contig::is_mitochondrial,
            true,
        );
        // Non-mitochondrial
        check_all(&["chr1", "chrX"], Contig::is_mitochondrial, false);
    }

    #[test]
    fn test_is_alt() {
        check_all(
            &["chr1_KI270762v1_alt", "HLA_alt_1"],
            Contig::is_alt,
            true,
        );
        check_all(&["chr1", "alt", "chrM"], Contig::is_alt, false);
    }

    #[test]
    fn test_is_decoy() {
        check_all(
            &[
                "hs37d5",
                "chrUn_GL000220v1",
                "chr1_KI270706v1_random",
                "chrUn_JTFH01000001v1_decoy",
            ],
            Contig::is_decoy,
            true,
        );
        check_all(&["chr1", "chrM", "X"], Contig::is_decoy, false);
    }

    #[test]
    fn test_sequence_role_parse() {
        assert_eq!(
            SequenceRole::parse("assembled-molecule"),
            SequenceRole::AssembledMolecule
        );
        assert_eq!(SequenceRole::parse("alt-scaffold"), SequenceRole::AltScaffold);
        // Matching is case-insensitive.
        assert_eq!(SequenceRole::parse("FIX-PATCH"), SequenceRole::FixPatch);
        assert_eq!(SequenceRole::parse("Novel-Patch"), SequenceRole::NovelPatch);
        assert_eq!(
            SequenceRole::parse("unlocalized-scaffold"),
            SequenceRole::UnlocalizedScaffold
        );
        assert_eq!(
            SequenceRole::parse("unplaced-scaffold"),
            SequenceRole::UnplacedScaffold
        );
        // Anything unrecognized falls back to Unknown.
        assert_eq!(SequenceRole::parse(""), SequenceRole::Unknown);
        assert_eq!(SequenceRole::parse("something-else"), SequenceRole::Unknown);
    }

    #[test]
    fn test_detect_naming_convention() {
        assert!(matches!(
            detect_naming_convention(&contigs(&["chr1", "chr2", "chrM"])),
            NamingConvention::Ucsc
        ));
        assert!(matches!(
            detect_naming_convention(&contigs(&["1", "2", "MT"])),
            NamingConvention::Ncbi
        ));
        assert!(matches!(
            detect_naming_convention(&contigs(&["chr1", "2"])),
            NamingConvention::Mixed
        ));
    }
}