prole/gtdb/metadata_r214.rs
1use std::collections::HashMap;
2use std::fs::File;
3use std::io::{BufRead, BufReader};
4use std::path::Path;
5
6use crate::error::{ProleError, ProleResult};
7use crate::genome::genome_id::GenomeId;
8use crate::gtdb::taxonomy::Taxonomy;
9
10// fn parse_float<T: std::str::FromStr<Err=ParseFloatError>>(value: &str) -> ProleResult<T> {
11// value.parse().map_err(ProleError::ParseFloatError)
12// }
13//
14// fn parse_opt_float<T: std::str::FromStr<Err=ParseFloatError>>(value: &str) -> ProleResult<Option<T>> {
15// if value == "none" {
16// Ok(None)
17// } else {
18// value.parse().map(Some).map_err(ProleError::ParseFloatError)
19// }
20// }
21//
22// fn parse_int<T: std::str::FromStr<Err=ParseIntError>>(value: &str) -> ProleResult<T> {
23// value.parse().map_err(ProleError::ParseIntError)
24// }
25//
26// fn parse_opt_int<T: std::str::FromStr<Err=ParseIntError>>(value: &str) -> ProleResult<Option<T>> {
27// if value == "none" {
28// Ok(None)
29// } else {
30// value.parse().map(Some).map_err(ProleError::ParseIntError)
31// }
32// }
33//
34// fn parse_opt_string(value: &str) -> Option<String> {
35// if value == "none" {
36// None
37// } else {
38// Some(value.to_string())
39// }
40// }
41
42/// A row within the [GtdbMetadataR214] file.
43pub struct GtdbMetadataR214Row {
44 pub accession: GenomeId,
45 // pub ambiguous_bases: usize,
46 // pub checkm_completeness: f32,
47 // pub checkm_contamination: f32,
48 // pub checkm_marker_count: usize,
49 // pub checkm_marker_lineage: String,
50 // pub checkm_marker_set_count: usize,
51 // pub checkm_strain_heterogeneity: f32,
52 // pub coding_bases: usize,
53 // pub coding_density: f64,
54 // pub contig_count: usize,
55 // pub gc_count: usize,
56 // pub gc_percentage: f64,
57 // pub genome_size: usize,
58 // pub gtdb_genome_representative: GenomeId,
59 pub gtdb_representative: bool,
60 pub gtdb_taxonomy: Taxonomy,
61 // pub gtdb_type_designation_ncbi_taxa: String,
62 // pub gtdb_type_designation_ncbi_taxa_sources: Option<String>,
63 // pub gtdb_type_species_of_genus: bool,
64 // pub l50_contigs: usize,
65 // pub l50_scaffolds: usize,
66 // pub longest_contig: usize,
67 // pub longest_scaffold: usize,
68 // pub lsu_23s_contig_len: Option<usize>,
69 // pub lsu_23s_count: usize,
70 // pub lsu_23s_length: Option<usize>,
71 // pub lsu_23s_query_id: Option<String>,
72 // pub lsu_5s_contig_len: Option<usize>,
73 // pub lsu_5s_count: usize,
74 // pub lsu_5s_length: Option<usize>,
75 // pub lsu_5s_query_id: Option<String>,
76 // pub lsu_silva_23s_blast_align_len: Option<usize>,
77 // pub lsu_silva_23s_blast_bitscore: Option<usize>,
78 // pub lsu_silva_23s_blast_evalue: Option<f64>,
79 // pub lsu_silva_23s_blast_perc_identity: Option<f64>,
80 // pub lsu_silva_23s_blast_subject_id: Option<String>,
81 // pub lsu_silva_23s_taxonomy: Option<String>,
82 // pub mean_contig_length: usize,
83 // pub mean_scaffold_length: usize,
84 // pub mimag_high_quality: bool,
85 // pub mimag_low_quality: bool,
86 // pub mimag_medium_quality: bool,
87 // pub n50_contigs: usize,
88 // pub n50_scaffolds: usize,
89 // pub ncbi_assembly_level: String,
90 // pub ncbi_assembly_name: String,
91 // pub ncbi_assembly_type: Option<String>,
92 // pub ncbi_bioproject: String,
93 // pub ncbi_biosample: String,
94 // pub ncbi_contig_count: Option<usize>,
95 // pub ncbi_contig_n50: Option<usize>,
96 // pub ncbi_country: Option<String>,
97 // pub ncbi_date: String,
98 // pub ncbi_genbank_assembly_accession: GenomeId,
99 // pub ncbi_genome_category: Option<String>,
100 // pub ncbi_genome_representation: String,
101 // pub ncbi_isolate: Option<String>,
102 // pub ncbi_isolation_source: Option<String>,
103 // pub ncbi_lat_lon: Option<String>,
104 // pub ncbi_molecule_count: usize,
105 // pub ncbi_ncrna_count: usize,
106 // pub ncbi_organism_name: String,
107 // pub ncbi_protein_count: usize,
108 // pub ncbi_refseq_category: Option<String>,
109 // pub ncbi_rrna_count: usize,
110 // pub ncbi_scaffold_count: Option<usize>,
111 // pub ncbi_scaffold_l50: Option<usize>,
112 // pub ncbi_scaffold_n50: Option<usize>,
113 // pub ncbi_scaffold_n75: Option<usize>,
114 // pub ncbi_scaffold_n90: Option<usize>,
115 // pub ncbi_seq_rel_date: String,
116 // pub ncbi_spanned_gaps: usize,
117 // pub ncbi_species_taxid: usize,
118 // pub ncbi_ssu_count: usize,
119 // pub ncbi_strain_identifiers: String,
120 // pub ncbi_submitter: String,
121 // pub ncbi_taxid: usize,
122 // pub ncbi_taxonomy: String,
123 // pub ncbi_taxonomy_unfiltered: String,
124 // pub ncbi_total_gap_length: usize,
125 // pub ncbi_total_length: usize,
126 // pub ncbi_translation_table: usize,
127 // pub ncbi_trna_count: usize,
128 // pub ncbi_type_material_designation: Option<String>,
129 // pub ncbi_ungapped_length: usize,
130 // pub ncbi_unspanned_gaps: usize,
131 // pub ncbi_wgs_master: Option<String>,
132 // pub protein_count: usize,
133 // pub scaffold_count: usize,
134 // pub ssu_contig_len: usize,
135 // pub ssu_count: usize,
136 // pub ssu_gg_blast_align_len: Option<usize>,
137 // pub ssu_gg_blast_bitscore: Option<usize>,
138 // pub ssu_gg_blast_evalue: Option<f64>,
139 // pub ssu_gg_blast_perc_identity: Option<f64>,
140 // pub ssu_gg_blast_subject_id: Option<usize>,
141 // pub ssu_gg_taxonomy: Option<String>,
142 // pub ssu_length: Option<usize>,
143 // pub ssu_query_id: Option<String>,
144 // pub ssu_silva_blast_align_len: Option<usize>,
145 // pub ssu_silva_blast_bitscore: Option<usize>,
146 // pub ssu_silva_blast_evalue: Option<usize>,
147 // pub ssu_silva_blast_perc_identity: Option<usize>,
148 // pub ssu_silva_blast_subject_id: Option<String>,
149 // pub ssu_silva_taxonomy: Option<String>,
150 // pub total_gap_length: usize,
151 // pub trna_aa_count: usize,
152 // pub trna_count: usize,
153 // pub trna_selenocysteine_count: usize,
154}
155
156impl GtdbMetadataR214Row {
157 pub fn from_string(string: &str) -> ProleResult<Self> {
158 let split = string.split('\t').collect::<Vec<&str>>();
159 if split.len() != 110 {
160 return Err(ProleError::Exit(format!("Expected 110 columns, got {}", split.len())));
161 }
162 let out = Self {
163 accession: GenomeId(split[0].to_string()),
164 // ambiguous_bases: parse_int(split[1])?,
165 // checkm_completeness: parse_float(split[2])?,
166 // checkm_contamination: parse_float(split[3])?,
167 // checkm_marker_count: parse_int(split[4])?,
168 // checkm_marker_lineage: split[5].to_string(),
169 // checkm_marker_set_count: parse_int(split[6])?,
170 // checkm_strain_heterogeneity: parse_float(split[7])?,
171 // coding_bases: parse_int(split[8])?,
172 // coding_density: parse_float(split[9])?,
173 // contig_count: parse_int(split[10])?,
174 // gc_count: parse_int(split[11])?,
175 // gc_percentage: parse_float(split[12])?,
176 // genome_size: parse_int(split[13])?,
177 // gtdb_genome_representative: GenomeId(split[14].to_string()),
178 gtdb_representative: split[15] == "t",
179 gtdb_taxonomy: Taxonomy::from_string(split[16])?,
180 // gtdb_type_designation_ncbi_taxa: split[17].to_string(),
181 // gtdb_type_designation_ncbi_taxa_sources: parse_opt_string(split[18]),
182 // gtdb_type_species_of_genus: split[19] == "t",
183 // l50_contigs: parse_int(split[20])?,
184 // l50_scaffolds: parse_int(split[21])?,
185 // longest_contig: parse_int(split[22])?,
186 // longest_scaffold: parse_int(split[23])?,
187 // lsu_23s_contig_len: parse_opt_int(split[24])?,
188 // lsu_23s_count: parse_int(split[25])?,
189 // lsu_23s_length: parse_opt_int(split[26])?,
190 // lsu_23s_query_id: parse_opt_string(split[27]),
191 // lsu_5s_contig_len: parse_opt_int(split[28])?,
192 // lsu_5s_count: parse_int(split[29])?,
193 // lsu_5s_length: parse_opt_int(split[30])?,
194 // lsu_5s_query_id: parse_opt_string(split[31]),
195 // lsu_silva_23s_blast_align_len: parse_opt_int(split[32])?,
196 // lsu_silva_23s_blast_bitscore: parse_opt_int(split[33])?,
197 // lsu_silva_23s_blast_evalue: parse_opt_float(split[34])?,
198 // lsu_silva_23s_blast_perc_identity: parse_opt_float(split[35])?,
199 // lsu_silva_23s_blast_subject_id: parse_opt_string(split[36]),
200 // lsu_silva_23s_taxonomy: parse_opt_string(split[37]),
201 // mean_contig_length: parse_int(split[38])?,
202 // mean_scaffold_length: parse_int(split[39])?,
203 // mimag_high_quality: split[40] == "t",
204 // mimag_low_quality: split[41] == "t",
205 // mimag_medium_quality: split[42] == "t",
206 // n50_contigs: parse_int(split[43])?,
207 // n50_scaffolds: parse_int(split[44])?,
208 // ncbi_assembly_level: split[45].to_string(),
209 // ncbi_assembly_name: split[46].to_string(),
210 // ncbi_assembly_type: parse_opt_string(split[47]),
211 // ncbi_bioproject: split[48].to_string(),
212 // ncbi_biosample: split[49].to_string(),
213 // ncbi_contig_count: parse_opt_int(split[50])?,
214 // ncbi_contig_n50: parse_opt_int(split[51])?,
215 // ncbi_country: parse_opt_string(split[52]),
216 // ncbi_date: split[53].to_string(),
217 // ncbi_genbank_assembly_accession: GenomeId(split[54].to_string()),
218 // ncbi_genome_category: parse_opt_string(split[55]),
219 // ncbi_genome_representation: split[56].to_string(),
220 // ncbi_isolate: parse_opt_string(split[57]),
221 // ncbi_isolation_source: parse_opt_string(split[58]),
222 // ncbi_lat_lon: parse_opt_string(split[59]),
223 // ncbi_molecule_count: parse_int(split[60])?,
224 // ncbi_ncrna_count: parse_int(split[61])?,
225 // ncbi_organism_name: split[62].to_string(),
226 // ncbi_protein_count: parse_int(split[63])?,
227 // ncbi_refseq_category: parse_opt_string(split[64]),
228 // ncbi_rrna_count: parse_int(split[65])?,
229 // ncbi_scaffold_count: parse_opt_int(split[66])?,
230 // ncbi_scaffold_l50: parse_opt_int(split[67])?,
231 // ncbi_scaffold_n50: parse_opt_int(split[68])?,
232 // ncbi_scaffold_n75: parse_opt_int(split[69])?,
233 // ncbi_scaffold_n90: parse_opt_int(split[70])?,
234 // ncbi_seq_rel_date: split[71].to_string(),
235 // ncbi_spanned_gaps: parse_int(split[72])?,
236 // ncbi_species_taxid: parse_int(split[73])?,
237 // ncbi_ssu_count: parse_int(split[74])?,
238 // ncbi_strain_identifiers: split[75].to_string(),
239 // ncbi_submitter: split[76].to_string(),
240 // ncbi_taxid: parse_int(split[77])?,
241 // ncbi_taxonomy: split[78].to_string(),
242 // ncbi_taxonomy_unfiltered: split[79].to_string(),
243 // ncbi_total_gap_length: parse_int(split[80])?,
244 // ncbi_total_length: parse_int(split[81])?,
245 // ncbi_translation_table: parse_int(split[82])?,
246 // ncbi_trna_count: parse_int(split[83])?,
247 // ncbi_type_material_designation: parse_opt_string(split[84]),
248 // ncbi_ungapped_length: parse_int(split[85])?,
249 // ncbi_unspanned_gaps: parse_int(split[86])?,
250 // ncbi_wgs_master: parse_opt_string(split[87]),
251 // protein_count: parse_int(split[88])?,
252 // scaffold_count: parse_int(split[89])?,
253 // ssu_contig_len: parse_int(split[90])?,
254 // ssu_count: parse_int(split[91])?,
255 // ssu_gg_blast_align_len: parse_opt_int(split[92])?,
256 // ssu_gg_blast_bitscore: parse_opt_int(split[93])?,
257 // ssu_gg_blast_evalue: parse_opt_float(split[94])?,
258 // ssu_gg_blast_perc_identity: parse_opt_float(split[95])?,
259 // ssu_gg_blast_subject_id: parse_opt_int(split[96])?,
260 // ssu_gg_taxonomy: parse_opt_string(split[97]),
261 // ssu_length: parse_opt_int(split[98])?,
262 // ssu_query_id: parse_opt_string(split[99]),
263 // ssu_silva_blast_align_len: parse_opt_int(split[100])?,
264 // ssu_silva_blast_bitscore: parse_opt_int(split[101])?,
265 // ssu_silva_blast_evalue: parse_opt_int(split[102])?,
266 // ssu_silva_blast_perc_identity: parse_opt_int(split[103])?,
267 // ssu_silva_blast_subject_id: parse_opt_string(split[104]),
268 // ssu_silva_taxonomy: parse_opt_string(split[105]),
269 // total_gap_length: parse_int(split[106])?,
270 // trna_aa_count: parse_int(split[107])?,
271 // trna_count: parse_int(split[108])?,
272 // trna_selenocysteine_count: parse_int(split[109])?,
273 };
274 Ok(out)
275 }
276}
277
278
279/// The GTDB R214 metadata file.
280pub struct GtdbMetadataR214 {
281 pub rows: HashMap<GenomeId, GtdbMetadataR214Row>,
282}
283
284impl GtdbMetadataR214 {
285 pub fn from_bufreader<T: std::io::Read>(buf: BufReader<T>) -> ProleResult<Self> {
286 let mut out: HashMap<GenomeId, GtdbMetadataR214Row> = HashMap::new();
287 for line in buf.lines() {
288 let line = line.map_err(ProleError::IoError)?;
289 if line.starts_with("accession\tambiguous_bases") || line.is_empty() {
290 continue;
291 }
292 let row = GtdbMetadataR214Row::from_string(&line)?;
293 out.insert(row.accession.clone(), row);
294 }
295 Ok(Self {
296 rows: out
297 })
298 }
299
300 pub fn from_path(path: &Path) -> ProleResult<Self> {
301 let file = File::open(path).map_err(ProleError::IoError)?;
302 let reader = BufReader::new(file);
303 Self::from_bufreader(reader)
304 }
305}
306
307
#[cfg(test)]
mod tests {
    use super::*;

    /// A complete 110-column metadata line should parse without error.
    #[test]
    fn test_from_string() {
        let line = "RS_GCF_000246985.2\t44\t99.5\t0.5\t299\tp__Euryarchaeota (UID4)\t202\t0\t2014456\t90.93903317665627\t1\t954455\t43.08802922449628\t2215172\tRS_GCF_024054535.1\tf\td__Archaea;p__Methanobacteriota_B;c__Thermococci;o__Thermococcales;f__Thermococcaceae;g__Thermococcus_A;s__Thermococcus_A alcaliphilus\ttype strain of species\tLPSN\tf\t1\t1\t2215172\t2215172\t2215172\t1\t3020\tNC_022084.1\t2215172\t2\t103\tNC_022084.1\t3020\t5561\t0\t99.901\tAKID01000054.18410.21433\tArchaea;Euryarchaeota;Thermococci;Thermococcales;Thermococcaceae;Thermococcus;Thermococcus sp. PK\t2215172\t2215172\tt\tf\tf\t2215172\t2215172\tComplete Genome\tASM24698v3\tna\tPRJNA224116\tSAMN02603679\tnone\tnone\tnone\t2013-08-13\tGCA_000246985.3\tnone\tfull\tnone\tnone\tnone\t1\t0\tThermococcus litoralis DSM 5473\t2402\trepresentative genome\t4\t1\t1\t2215172\t2215172\t2215172\t2013/08/13\t0\t2265\t1\tDSM 5473\tNew England Biolabs, Inc.\t523849\td__Archaea;p__Euryarchaeota;c__Thermococci;o__Thermococcales;f__Thermococcaceae;g__Thermococcus;s__Thermococcus litoralis\td__Archaea;p__Euryarchaeota;c__Thermococci;o__Thermococcales;f__Thermococcaceae;g__Thermococcus;s__Thermococcus litoralis;x__Thermococcus litoralis DSM 5473\t0\t2215172\t11\t46\tassembly from type material\t2215172\t0\tnone\t2497\t1\t2215172\t1\tnone\tnone\tnone\tnone\tnone\tnone\t1485\tNC_022084.1\t1485\t2743\t0\t100\tCP006670.774259.775759\tArchaea;Euryarchaeota;Thermococci;Thermococcales;Thermococcaceae;Thermococcus;Thermococcus litoralis DSM 5473\t0\t19\t45\t0";
        let parsed = GtdbMetadataR214Row::from_string(line);
        assert!(parsed.is_ok());
    }
}