ref_solver/core/
contig.rs1use serde::{Deserialize, Serialize};
2
3use crate::core::types::NamingConvention;
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
7#[serde(rename_all = "kebab-case")]
8pub enum SequenceRole {
9 AssembledMolecule,
11 AltScaffold,
13 FixPatch,
15 NovelPatch,
17 UnlocalizedScaffold,
19 UnplacedScaffold,
21 #[default]
23 Unknown,
24}
25
26impl SequenceRole {
27 #[must_use]
29 pub fn parse(s: &str) -> Self {
30 match s.to_lowercase().as_str() {
31 "assembled-molecule" => SequenceRole::AssembledMolecule,
32 "alt-scaffold" => SequenceRole::AltScaffold,
33 "fix-patch" => SequenceRole::FixPatch,
34 "novel-patch" => SequenceRole::NovelPatch,
35 "unlocalized-scaffold" => SequenceRole::UnlocalizedScaffold,
36 "unplaced-scaffold" => SequenceRole::UnplacedScaffold,
37 _ => SequenceRole::Unknown,
38 }
39 }
40}
41
42#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44#[non_exhaustive]
45pub struct Contig {
46 pub name: String,
48
49 pub length: u64,
51
52 #[serde(default, skip_serializing_if = "Option::is_none")]
55 pub md5: Option<String>,
56
57 #[serde(default, skip_serializing_if = "Option::is_none")]
60 pub sha512t24u: Option<String>,
61
62 #[serde(default, skip_serializing_if = "Option::is_none")]
64 pub assembly: Option<String>,
65
66 #[serde(default, skip_serializing_if = "Option::is_none")]
68 pub uri: Option<String>,
69
70 #[serde(default, skip_serializing_if = "Option::is_none")]
72 pub species: Option<String>,
73
74 #[serde(default, skip_serializing_if = "Vec::is_empty")]
76 pub aliases: Vec<String>,
77
78 #[serde(default, skip_serializing_if = "is_unknown_role")]
80 pub sequence_role: SequenceRole,
81}
82
83#[allow(clippy::trivially_copy_pass_by_ref)] fn is_unknown_role(role: &SequenceRole) -> bool {
85 matches!(role, SequenceRole::Unknown)
86}
87
88impl Contig {
89 pub fn new(name: impl Into<String>, length: u64) -> Self {
90 Self {
91 name: name.into(),
92 length,
93 md5: None,
94 sha512t24u: None,
95 assembly: None,
96 uri: None,
97 species: None,
98 aliases: Vec::new(),
99 sequence_role: SequenceRole::Unknown,
100 }
101 }
102
103 #[cfg(test)]
104 #[must_use]
105 pub fn with_md5(mut self, md5: impl Into<String>) -> Self {
106 self.md5 = Some(md5.into());
107 self
108 }
109
110 #[cfg(test)]
111 #[must_use]
112 pub fn with_aliases(mut self, aliases: Vec<String>) -> Self {
113 self.aliases = aliases;
114 self
115 }
116
117 #[must_use]
120 pub fn is_primary_chromosome(&self) -> bool {
121 matches!(
124 self.name.as_str(),
125 "1" | "2"
126 | "3"
127 | "4"
128 | "5"
129 | "6"
130 | "7"
131 | "8"
132 | "9"
133 | "10"
134 | "11"
135 | "12"
136 | "13"
137 | "14"
138 | "15"
139 | "16"
140 | "17"
141 | "18"
142 | "19"
143 | "20"
144 | "21"
145 | "22"
146 | "X"
147 | "Y"
148 | "chr1"
149 | "chr2"
150 | "chr3"
151 | "chr4"
152 | "chr5"
153 | "chr6"
154 | "chr7"
155 | "chr8"
156 | "chr9"
157 | "chr10"
158 | "chr11"
159 | "chr12"
160 | "chr13"
161 | "chr14"
162 | "chr15"
163 | "chr16"
164 | "chr17"
165 | "chr18"
166 | "chr19"
167 | "chr20"
168 | "chr21"
169 | "chr22"
170 | "chrX"
171 | "chrY"
172 )
173 }
174
175 #[must_use]
178 pub fn is_mitochondrial(&self) -> bool {
179 let name_lower = self.name.to_lowercase();
180 matches!(
181 name_lower.as_str(),
182 "mt" | "m" | "chrm" | "chrmt" | "mito" | "mitochondrion" | "rcrs" | "nc_012920.1"
183 ) || name_lower.contains("mitochon")
184 }
185
186 #[must_use]
188 pub fn is_alt(&self) -> bool {
189 self.name.ends_with("_alt") || self.name.contains("_alt_")
190 }
191
192 #[must_use]
194 pub fn is_decoy(&self) -> bool {
195 self.name.contains("decoy")
196 || self.name == "hs37d5"
197 || self.name.starts_with("chrUn_")
198 || self.name.contains("_random")
199 }
200}
201
202#[must_use]
208pub fn detect_naming_convention(contigs: &[Contig]) -> NamingConvention {
209 let mut has_chr_prefix = false;
210 let mut has_no_prefix = false;
211
212 for contig in contigs {
213 if contig.is_primary_chromosome() {
214 if contig.name.starts_with("chr") {
215 has_chr_prefix = true;
216 } else {
217 has_no_prefix = true;
218 }
219 }
220 }
221
222 match (has_chr_prefix, has_no_prefix) {
223 (true, false) => NamingConvention::Ucsc,
224 (false, true) => NamingConvention::Ncbi,
225 _ => NamingConvention::Mixed,
226 }
227}
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Primary chromosomes are exactly 1-22, X and Y, with or without `chr`.
    #[test]
    fn test_is_primary_chromosome() {
        for name in ["chr1", "1", "chrX", "Y"] {
            assert!(
                Contig::new(name, 100).is_primary_chromosome(),
                "{name} should be a primary chromosome"
            );
        }

        for name in ["chrM", "chr1_random"] {
            assert!(
                !Contig::new(name, 100).is_primary_chromosome(),
                "{name} should not be a primary chromosome"
            );
        }
    }

    /// Mitochondrial detection is case-insensitive and covers the known
    /// spellings as well as any name containing `mitochon`.
    #[test]
    fn test_is_mitochondrial() {
        let mitochondrial = [
            "chrM",
            "MT",
            "chrMT",
            "M",
            "mito",
            "Mitochondrion",
            "rCRS",
            "NC_012920.1",
            "mitochondrial_genome",
        ];
        for name in mitochondrial {
            assert!(
                Contig::new(name, 100).is_mitochondrial(),
                "{name} should be mitochondrial"
            );
        }

        for name in ["chr1", "chrX"] {
            assert!(
                !Contig::new(name, 100).is_mitochondrial(),
                "{name} should not be mitochondrial"
            );
        }
    }
}