// ref_solver/core/contig.rs

1use serde::{Deserialize, Serialize};
2
3use crate::core::types::NamingConvention;
4
/// Sequence role from NCBI assembly report
///
/// The serde representation uses kebab-case variant names (for example
/// `AssembledMolecule` is written as `assembled-molecule`), and
/// `SequenceRole::Unknown` is the `Default` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum SequenceRole {
    /// Primary chromosome (1-22, X, Y, MT)
    AssembledMolecule,
    /// Alternate locus scaffold
    AltScaffold,
    /// Fix patch (error correction)
    FixPatch,
    /// Novel patch (new sequence)
    NovelPatch,
    /// Unlocalized scaffold (known chromosome, unknown location)
    UnlocalizedScaffold,
    /// Unplaced scaffold (unknown chromosome)
    UnplacedScaffold,
    /// Role not specified or unknown
    // Marked `#[default]`, so `SequenceRole::default()` yields this variant.
    #[default]
    Unknown,
}
25
26impl SequenceRole {
27    /// Parse a sequence role from string representation (e.g. from NCBI assembly report)
28    #[must_use]
29    pub fn parse(s: &str) -> Self {
30        match s.to_lowercase().as_str() {
31            "assembled-molecule" => SequenceRole::AssembledMolecule,
32            "alt-scaffold" => SequenceRole::AltScaffold,
33            "fix-patch" => SequenceRole::FixPatch,
34            "novel-patch" => SequenceRole::NovelPatch,
35            "unlocalized-scaffold" => SequenceRole::UnlocalizedScaffold,
36            "unplaced-scaffold" => SequenceRole::UnplacedScaffold,
37            _ => SequenceRole::Unknown,
38        }
39    }
40}
41
/// A single contig/sequence in a reference genome
///
/// Only `name` and `length` are always serialized. Every other field is left
/// out of the serialized form while it holds its "empty" value (`None`, an
/// empty `Vec`, or `SequenceRole::Unknown`) and falls back to its `Default`
/// when it is absent during deserialization.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
// `#[non_exhaustive]`: code outside this crate cannot build the struct with a
// literal and has to go through a constructor such as `Contig::new`.
#[non_exhaustive]
pub struct Contig {
    /// Sequence name (SN tag in SAM)
    pub name: String,

    /// Sequence length (LN tag in SAM)
    pub length: u64,

    /// MD5 checksum of the sequence (M5 tag in SAM)
    /// Lowercase hex, 32 characters
    // NOTE(review): the format above is not validated anywhere in this type.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub md5: Option<String>,

    /// GA4GH sha512t24u digest (SHA-512, truncate to 24 bytes, base64url no-pad)
    /// 32-character string, used for refget/seqcol compatibility
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sha512t24u: Option<String>,

    /// Assembly identifier (AS tag in SAM)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub assembly: Option<String>,

    /// URI where sequence can be retrieved (UR tag in SAM)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub uri: Option<String>,

    /// Species (SP tag in SAM)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub species: Option<String>,

    /// Known alternative names for this contig
    // Omitted from the serialized form when the list is empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub aliases: Vec<String>,

    /// Sequence role from NCBI assembly report
    // Omitted from the serialized form when the role is `SequenceRole::Unknown`.
    #[serde(default, skip_serializing_if = "is_unknown_role")]
    pub sequence_role: SequenceRole,
}
82
83#[allow(clippy::trivially_copy_pass_by_ref)] // Required signature for serde skip_serializing_if
84fn is_unknown_role(role: &SequenceRole) -> bool {
85    matches!(role, SequenceRole::Unknown)
86}
87
88impl Contig {
89    pub fn new(name: impl Into<String>, length: u64) -> Self {
90        Self {
91            name: name.into(),
92            length,
93            md5: None,
94            sha512t24u: None,
95            assembly: None,
96            uri: None,
97            species: None,
98            aliases: Vec::new(),
99            sequence_role: SequenceRole::Unknown,
100        }
101    }
102
103    #[cfg(test)]
104    #[must_use]
105    pub fn with_md5(mut self, md5: impl Into<String>) -> Self {
106        self.md5 = Some(md5.into());
107        self
108    }
109
110    #[cfg(test)]
111    #[must_use]
112    pub fn with_aliases(mut self, aliases: Vec<String>) -> Self {
113        self.aliases = aliases;
114        self
115    }
116
117    /// Check if this contig is a primary chromosome (1-22, X, Y)
118    /// Matches both UCSC (chr1) and NCBI (1) naming conventions exactly
119    #[must_use]
120    pub fn is_primary_chromosome(&self) -> bool {
121        // NCBI style: 1-22, X, Y
122        // UCSC style: chr1-chr22, chrX, chrY
123        matches!(
124            self.name.as_str(),
125            "1" | "2"
126                | "3"
127                | "4"
128                | "5"
129                | "6"
130                | "7"
131                | "8"
132                | "9"
133                | "10"
134                | "11"
135                | "12"
136                | "13"
137                | "14"
138                | "15"
139                | "16"
140                | "17"
141                | "18"
142                | "19"
143                | "20"
144                | "21"
145                | "22"
146                | "X"
147                | "Y"
148                | "chr1"
149                | "chr2"
150                | "chr3"
151                | "chr4"
152                | "chr5"
153                | "chr6"
154                | "chr7"
155                | "chr8"
156                | "chr9"
157                | "chr10"
158                | "chr11"
159                | "chr12"
160                | "chr13"
161                | "chr14"
162                | "chr15"
163                | "chr16"
164                | "chr17"
165                | "chr18"
166                | "chr19"
167                | "chr20"
168                | "chr21"
169                | "chr22"
170                | "chrX"
171                | "chrY"
172        )
173    }
174
175    /// Check if this is a mitochondrial contig
176    /// Matches common mitochondrial names from various reference builds
177    #[must_use]
178    pub fn is_mitochondrial(&self) -> bool {
179        let name_lower = self.name.to_lowercase();
180        matches!(
181            name_lower.as_str(),
182            "mt" | "m" | "chrm" | "chrmt" | "mito" | "mitochondrion" | "rcrs" | "nc_012920.1"
183        ) || name_lower.contains("mitochon")
184    }
185
186    /// Check if this is an ALT contig (`GRCh38`)
187    #[must_use]
188    pub fn is_alt(&self) -> bool {
189        self.name.ends_with("_alt") || self.name.contains("_alt_")
190    }
191
192    /// Check if this is a decoy contig
193    #[must_use]
194    pub fn is_decoy(&self) -> bool {
195        self.name.contains("decoy")
196            || self.name == "hs37d5"
197            || self.name.starts_with("chrUn_")
198            || self.name.contains("_random")
199    }
200}
201
202// NOTE: normalize_contig_name() was removed.
203// Name equivalence is now defined ONLY through explicit aliases (AN tag in SAM/dict,
204// or NCBI assembly report columns). Matching uses exact names.
205
206/// Detect the naming convention used by a set of contigs
207#[must_use]
208pub fn detect_naming_convention(contigs: &[Contig]) -> NamingConvention {
209    let mut has_chr_prefix = false;
210    let mut has_no_prefix = false;
211
212    for contig in contigs {
213        if contig.is_primary_chromosome() {
214            if contig.name.starts_with("chr") {
215                has_chr_prefix = true;
216            } else {
217                has_no_prefix = true;
218            }
219        }
220    }
221
222    match (has_chr_prefix, has_no_prefix) {
223        (true, false) => NamingConvention::Ucsc,
224        (false, true) => NamingConvention::Ncbi,
225        _ => NamingConvention::Mixed,
226    }
227}
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Primary chromosomes are matched by exact name in either spelling.
    #[test]
    fn test_is_primary_chromosome() {
        assert!(Contig::new("chr1", 100).is_primary_chromosome());
        assert!(Contig::new("1", 100).is_primary_chromosome());
        assert!(Contig::new("chrX", 100).is_primary_chromosome());
        assert!(Contig::new("Y", 100).is_primary_chromosome());
        assert!(Contig::new("22", 100).is_primary_chromosome());
        assert!(Contig::new("chr22", 100).is_primary_chromosome());
        assert!(!Contig::new("chrM", 100).is_primary_chromosome());
        assert!(!Contig::new("chr1_random", 100).is_primary_chromosome());
        // Matching is exact: out-of-range numbers, leading zeros, other
        // letter case and empty names are all rejected.
        assert!(!Contig::new("0", 100).is_primary_chromosome());
        assert!(!Contig::new("23", 100).is_primary_chromosome());
        assert!(!Contig::new("01", 100).is_primary_chromosome());
        assert!(!Contig::new("chr01", 100).is_primary_chromosome());
        assert!(!Contig::new("chrx", 100).is_primary_chromosome());
        assert!(!Contig::new("Chr1", 100).is_primary_chromosome());
        assert!(!Contig::new("chr", 100).is_primary_chromosome());
        assert!(!Contig::new("", 100).is_primary_chromosome());
    }

    /// Mitochondrial names are matched case-insensitively.
    #[test]
    fn test_is_mitochondrial() {
        // Standard names
        assert!(Contig::new("chrM", 100).is_mitochondrial());
        assert!(Contig::new("MT", 100).is_mitochondrial());
        assert!(Contig::new("chrMT", 100).is_mitochondrial());
        assert!(Contig::new("M", 100).is_mitochondrial());
        // Extended names from older references
        assert!(Contig::new("mito", 100).is_mitochondrial());
        assert!(Contig::new("Mitochondrion", 100).is_mitochondrial());
        assert!(Contig::new("rCRS", 100).is_mitochondrial());
        assert!(Contig::new("NC_012920.1", 100).is_mitochondrial());
        // Substring match
        assert!(Contig::new("mitochondrial_genome", 100).is_mitochondrial());
        // Non-mitochondrial
        assert!(!Contig::new("chr1", 100).is_mitochondrial());
        assert!(!Contig::new("chrX", 100).is_mitochondrial());
    }

    /// ALT contigs are recognised by an `_alt` suffix or an `_alt_` infix.
    #[test]
    fn test_is_alt() {
        assert!(Contig::new("chr1_KI270762v1_alt", 100).is_alt());
        assert!(Contig::new("chr1_alt_scaffold", 100).is_alt());
        assert!(!Contig::new("chr1", 100).is_alt());
        assert!(!Contig::new("alternate", 100).is_alt());
        // The check is case-sensitive.
        assert!(!Contig::new("chr1_ALT", 100).is_alt());
    }

    /// Pins the names currently reported by `is_decoy`.
    #[test]
    fn test_is_decoy() {
        assert!(Contig::new("hs37d5", 100).is_decoy());
        assert!(Contig::new("chrUn_JTFH01000001v1_decoy", 100).is_decoy());
        // `chrUn_` and `_random` names are reported as decoys as well.
        assert!(Contig::new("chrUn_GL000220v1", 100).is_decoy());
        assert!(Contig::new("chr1_KI270706v1_random", 100).is_decoy());
        assert!(!Contig::new("chr1", 100).is_decoy());
        assert!(!Contig::new("chr1_KI270762v1_alt", 100).is_decoy());
    }

    /// Role parsing ignores case and never fails.
    #[test]
    fn test_sequence_role_parse() {
        assert_eq!(
            SequenceRole::parse("assembled-molecule"),
            SequenceRole::AssembledMolecule
        );
        assert_eq!(SequenceRole::parse("ALT-SCAFFOLD"), SequenceRole::AltScaffold);
        assert_eq!(SequenceRole::parse("Fix-Patch"), SequenceRole::FixPatch);
        assert_eq!(SequenceRole::parse("novel-patch"), SequenceRole::NovelPatch);
        assert_eq!(
            SequenceRole::parse("unlocalized-scaffold"),
            SequenceRole::UnlocalizedScaffold
        );
        assert_eq!(
            SequenceRole::parse("unplaced-scaffold"),
            SequenceRole::UnplacedScaffold
        );
        // Anything unrecognised falls back to Unknown.
        assert_eq!(SequenceRole::parse(""), SequenceRole::Unknown);
        assert_eq!(SequenceRole::parse("assembled_molecule"), SequenceRole::Unknown);
    }

    /// Only primary chromosomes take part in convention detection.
    #[test]
    fn test_detect_naming_convention() {
        // `matches!` keeps the test independent of which traits
        // `NamingConvention` implements.
        let ucsc = vec![Contig::new("chr1", 100), Contig::new("chrX", 100)];
        assert!(matches!(
            detect_naming_convention(&ucsc),
            NamingConvention::Ucsc
        ));

        // "MT" is not a primary chromosome, so it does not influence the result.
        let ncbi = vec![Contig::new("1", 100), Contig::new("MT", 100)];
        assert!(matches!(
            detect_naming_convention(&ncbi),
            NamingConvention::Ncbi
        ));

        let mixed = vec![Contig::new("chr1", 100), Contig::new("2", 100)];
        assert!(matches!(
            detect_naming_convention(&mixed),
            NamingConvention::Mixed
        ));

        // No primary chromosome at all also yields Mixed.
        let no_primary = vec![Contig::new("chrM", 100), Contig::new("hs37d5", 100)];
        assert!(matches!(
            detect_naming_convention(&no_primary),
            NamingConvention::Mixed
        ));
        assert!(matches!(
            detect_naming_convention(&[]),
            NamingConvention::Mixed
        ));
    }
}