prole/gtdb/
metadata_r214.rs

1use std::collections::HashMap;
2use std::fs::File;
3use std::io::{BufRead, BufReader};
4use std::path::Path;
5
6use crate::error::{ProleError, ProleResult};
7use crate::genome::genome_id::GenomeId;
8use crate::gtdb::taxonomy::Taxonomy;
9
10// fn parse_float<T: std::str::FromStr<Err=ParseFloatError>>(value: &str) -> ProleResult<T> {
11//     value.parse().map_err(ProleError::ParseFloatError)
12// }
13//
14// fn parse_opt_float<T: std::str::FromStr<Err=ParseFloatError>>(value: &str) -> ProleResult<Option<T>> {
15//     if value == "none" {
16//         Ok(None)
17//     } else {
18//         value.parse().map(Some).map_err(ProleError::ParseFloatError)
19//     }
20// }
21//
22// fn parse_int<T: std::str::FromStr<Err=ParseIntError>>(value: &str) -> ProleResult<T> {
23//     value.parse().map_err(ProleError::ParseIntError)
24// }
25//
26// fn parse_opt_int<T: std::str::FromStr<Err=ParseIntError>>(value: &str) -> ProleResult<Option<T>> {
27//     if value == "none" {
28//         Ok(None)
29//     } else {
30//         value.parse().map(Some).map_err(ProleError::ParseIntError)
31//     }
32// }
33//
34// fn parse_opt_string(value: &str) -> Option<String> {
35//     if value == "none" {
36//         None
37//     } else {
38//         Some(value.to_string())
39//     }
40// }
41
42/// A row within the [GtdbMetadataR214] file.
43pub struct GtdbMetadataR214Row {
44    pub accession: GenomeId,
45    // pub ambiguous_bases: usize,
46    // pub checkm_completeness: f32,
47    // pub checkm_contamination: f32,
48    // pub checkm_marker_count: usize,
49    // pub checkm_marker_lineage: String,
50    // pub checkm_marker_set_count: usize,
51    // pub checkm_strain_heterogeneity: f32,
52    // pub coding_bases: usize,
53    // pub coding_density: f64,
54    // pub contig_count: usize,
55    // pub gc_count: usize,
56    // pub gc_percentage: f64,
57    // pub genome_size: usize,
58    // pub gtdb_genome_representative: GenomeId,
59    pub gtdb_representative: bool,
60    pub gtdb_taxonomy: Taxonomy,
61    // pub gtdb_type_designation_ncbi_taxa: String,
62    // pub gtdb_type_designation_ncbi_taxa_sources: Option<String>,
63    // pub gtdb_type_species_of_genus: bool,
64    // pub l50_contigs: usize,
65    // pub l50_scaffolds: usize,
66    // pub longest_contig: usize,
67    // pub longest_scaffold: usize,
68    // pub lsu_23s_contig_len: Option<usize>,
69    // pub lsu_23s_count: usize,
70    // pub lsu_23s_length: Option<usize>,
71    // pub lsu_23s_query_id: Option<String>,
72    // pub lsu_5s_contig_len: Option<usize>,
73    // pub lsu_5s_count: usize,
74    // pub lsu_5s_length: Option<usize>,
75    // pub lsu_5s_query_id: Option<String>,
76    // pub lsu_silva_23s_blast_align_len: Option<usize>,
77    // pub lsu_silva_23s_blast_bitscore: Option<usize>,
78    // pub lsu_silva_23s_blast_evalue: Option<f64>,
79    // pub lsu_silva_23s_blast_perc_identity: Option<f64>,
80    // pub lsu_silva_23s_blast_subject_id: Option<String>,
81    // pub lsu_silva_23s_taxonomy: Option<String>,
82    // pub mean_contig_length: usize,
83    // pub mean_scaffold_length: usize,
84    // pub mimag_high_quality: bool,
85    // pub mimag_low_quality: bool,
86    // pub mimag_medium_quality: bool,
87    // pub n50_contigs: usize,
88    // pub n50_scaffolds: usize,
89    // pub ncbi_assembly_level: String,
90    // pub ncbi_assembly_name: String,
91    // pub ncbi_assembly_type: Option<String>,
92    // pub ncbi_bioproject: String,
93    // pub ncbi_biosample: String,
94    // pub ncbi_contig_count: Option<usize>,
95    // pub ncbi_contig_n50: Option<usize>,
96    // pub ncbi_country: Option<String>,
97    // pub ncbi_date: String,
98    // pub ncbi_genbank_assembly_accession: GenomeId,
99    // pub ncbi_genome_category: Option<String>,
100    // pub ncbi_genome_representation: String,
101    // pub ncbi_isolate: Option<String>,
102    // pub ncbi_isolation_source: Option<String>,
103    // pub ncbi_lat_lon: Option<String>,
104    // pub ncbi_molecule_count: usize,
105    // pub ncbi_ncrna_count: usize,
106    // pub ncbi_organism_name: String,
107    // pub ncbi_protein_count: usize,
108    // pub ncbi_refseq_category: Option<String>,
109    // pub ncbi_rrna_count: usize,
110    // pub ncbi_scaffold_count: Option<usize>,
111    // pub ncbi_scaffold_l50: Option<usize>,
112    // pub ncbi_scaffold_n50: Option<usize>,
113    // pub ncbi_scaffold_n75: Option<usize>,
114    // pub ncbi_scaffold_n90: Option<usize>,
115    // pub ncbi_seq_rel_date: String,
116    // pub ncbi_spanned_gaps: usize,
117    // pub ncbi_species_taxid: usize,
118    // pub ncbi_ssu_count: usize,
119    // pub ncbi_strain_identifiers: String,
120    // pub ncbi_submitter: String,
121    // pub ncbi_taxid: usize,
122    // pub ncbi_taxonomy: String,
123    // pub ncbi_taxonomy_unfiltered: String,
124    // pub ncbi_total_gap_length: usize,
125    // pub ncbi_total_length: usize,
126    // pub ncbi_translation_table: usize,
127    // pub ncbi_trna_count: usize,
128    // pub ncbi_type_material_designation: Option<String>,
129    // pub ncbi_ungapped_length: usize,
130    // pub ncbi_unspanned_gaps: usize,
131    // pub ncbi_wgs_master: Option<String>,
132    // pub protein_count: usize,
133    // pub scaffold_count: usize,
134    // pub ssu_contig_len: usize,
135    // pub ssu_count: usize,
136    // pub ssu_gg_blast_align_len: Option<usize>,
137    // pub ssu_gg_blast_bitscore: Option<usize>,
138    // pub ssu_gg_blast_evalue: Option<f64>,
139    // pub ssu_gg_blast_perc_identity: Option<f64>,
140    // pub ssu_gg_blast_subject_id: Option<usize>,
141    // pub ssu_gg_taxonomy: Option<String>,
142    // pub ssu_length: Option<usize>,
143    // pub ssu_query_id: Option<String>,
144    // pub ssu_silva_blast_align_len: Option<usize>,
145    // pub ssu_silva_blast_bitscore: Option<usize>,
146    // pub ssu_silva_blast_evalue: Option<usize>,
147    // pub ssu_silva_blast_perc_identity: Option<usize>,
148    // pub ssu_silva_blast_subject_id: Option<String>,
149    // pub ssu_silva_taxonomy: Option<String>,
150    // pub total_gap_length: usize,
151    // pub trna_aa_count: usize,
152    // pub trna_count: usize,
153    // pub trna_selenocysteine_count: usize,
154}
155
156impl GtdbMetadataR214Row {
157    pub fn from_string(string: &str) -> ProleResult<Self> {
158        let split = string.split('\t').collect::<Vec<&str>>();
159        if split.len() != 110 {
160            return Err(ProleError::Exit(format!("Expected 110 columns, got {}", split.len())));
161        }
162        let out = Self {
163            accession: GenomeId(split[0].to_string()),
164            // ambiguous_bases: parse_int(split[1])?,
165            // checkm_completeness: parse_float(split[2])?,
166            // checkm_contamination: parse_float(split[3])?,
167            // checkm_marker_count: parse_int(split[4])?,
168            // checkm_marker_lineage: split[5].to_string(),
169            // checkm_marker_set_count: parse_int(split[6])?,
170            // checkm_strain_heterogeneity: parse_float(split[7])?,
171            // coding_bases: parse_int(split[8])?,
172            // coding_density: parse_float(split[9])?,
173            // contig_count: parse_int(split[10])?,
174            // gc_count: parse_int(split[11])?,
175            // gc_percentage: parse_float(split[12])?,
176            // genome_size: parse_int(split[13])?,
177            // gtdb_genome_representative: GenomeId(split[14].to_string()),
178            gtdb_representative: split[15] == "t",
179            gtdb_taxonomy: Taxonomy::from_string(split[16])?,
180            // gtdb_type_designation_ncbi_taxa: split[17].to_string(),
181            // gtdb_type_designation_ncbi_taxa_sources: parse_opt_string(split[18]),
182            // gtdb_type_species_of_genus: split[19] == "t",
183            // l50_contigs: parse_int(split[20])?,
184            // l50_scaffolds: parse_int(split[21])?,
185            // longest_contig: parse_int(split[22])?,
186            // longest_scaffold: parse_int(split[23])?,
187            // lsu_23s_contig_len: parse_opt_int(split[24])?,
188            // lsu_23s_count: parse_int(split[25])?,
189            // lsu_23s_length: parse_opt_int(split[26])?,
190            // lsu_23s_query_id: parse_opt_string(split[27]),
191            // lsu_5s_contig_len: parse_opt_int(split[28])?,
192            // lsu_5s_count: parse_int(split[29])?,
193            // lsu_5s_length: parse_opt_int(split[30])?,
194            // lsu_5s_query_id: parse_opt_string(split[31]),
195            // lsu_silva_23s_blast_align_len: parse_opt_int(split[32])?,
196            // lsu_silva_23s_blast_bitscore: parse_opt_int(split[33])?,
197            // lsu_silva_23s_blast_evalue: parse_opt_float(split[34])?,
198            // lsu_silva_23s_blast_perc_identity: parse_opt_float(split[35])?,
199            // lsu_silva_23s_blast_subject_id: parse_opt_string(split[36]),
200            // lsu_silva_23s_taxonomy: parse_opt_string(split[37]),
201            // mean_contig_length: parse_int(split[38])?,
202            // mean_scaffold_length: parse_int(split[39])?,
203            // mimag_high_quality: split[40] == "t",
204            // mimag_low_quality: split[41] == "t",
205            // mimag_medium_quality: split[42] == "t",
206            // n50_contigs: parse_int(split[43])?,
207            // n50_scaffolds: parse_int(split[44])?,
208            // ncbi_assembly_level: split[45].to_string(),
209            // ncbi_assembly_name: split[46].to_string(),
210            // ncbi_assembly_type: parse_opt_string(split[47]),
211            // ncbi_bioproject: split[48].to_string(),
212            // ncbi_biosample: split[49].to_string(),
213            // ncbi_contig_count: parse_opt_int(split[50])?,
214            // ncbi_contig_n50: parse_opt_int(split[51])?,
215            // ncbi_country: parse_opt_string(split[52]),
216            // ncbi_date: split[53].to_string(),
217            // ncbi_genbank_assembly_accession: GenomeId(split[54].to_string()),
218            // ncbi_genome_category: parse_opt_string(split[55]),
219            // ncbi_genome_representation: split[56].to_string(),
220            // ncbi_isolate: parse_opt_string(split[57]),
221            // ncbi_isolation_source: parse_opt_string(split[58]),
222            // ncbi_lat_lon: parse_opt_string(split[59]),
223            // ncbi_molecule_count: parse_int(split[60])?,
224            // ncbi_ncrna_count: parse_int(split[61])?,
225            // ncbi_organism_name: split[62].to_string(),
226            // ncbi_protein_count: parse_int(split[63])?,
227            // ncbi_refseq_category: parse_opt_string(split[64]),
228            // ncbi_rrna_count: parse_int(split[65])?,
229            // ncbi_scaffold_count: parse_opt_int(split[66])?,
230            // ncbi_scaffold_l50: parse_opt_int(split[67])?,
231            // ncbi_scaffold_n50: parse_opt_int(split[68])?,
232            // ncbi_scaffold_n75: parse_opt_int(split[69])?,
233            // ncbi_scaffold_n90: parse_opt_int(split[70])?,
234            // ncbi_seq_rel_date: split[71].to_string(),
235            // ncbi_spanned_gaps: parse_int(split[72])?,
236            // ncbi_species_taxid: parse_int(split[73])?,
237            // ncbi_ssu_count: parse_int(split[74])?,
238            // ncbi_strain_identifiers: split[75].to_string(),
239            // ncbi_submitter: split[76].to_string(),
240            // ncbi_taxid: parse_int(split[77])?,
241            // ncbi_taxonomy: split[78].to_string(),
242            // ncbi_taxonomy_unfiltered: split[79].to_string(),
243            // ncbi_total_gap_length: parse_int(split[80])?,
244            // ncbi_total_length: parse_int(split[81])?,
245            // ncbi_translation_table: parse_int(split[82])?,
246            // ncbi_trna_count: parse_int(split[83])?,
247            // ncbi_type_material_designation: parse_opt_string(split[84]),
248            // ncbi_ungapped_length: parse_int(split[85])?,
249            // ncbi_unspanned_gaps: parse_int(split[86])?,
250            // ncbi_wgs_master: parse_opt_string(split[87]),
251            // protein_count: parse_int(split[88])?,
252            // scaffold_count: parse_int(split[89])?,
253            // ssu_contig_len: parse_int(split[90])?,
254            // ssu_count: parse_int(split[91])?,
255            // ssu_gg_blast_align_len: parse_opt_int(split[92])?,
256            // ssu_gg_blast_bitscore: parse_opt_int(split[93])?,
257            // ssu_gg_blast_evalue: parse_opt_float(split[94])?,
258            // ssu_gg_blast_perc_identity: parse_opt_float(split[95])?,
259            // ssu_gg_blast_subject_id: parse_opt_int(split[96])?,
260            // ssu_gg_taxonomy: parse_opt_string(split[97]),
261            // ssu_length: parse_opt_int(split[98])?,
262            // ssu_query_id: parse_opt_string(split[99]),
263            // ssu_silva_blast_align_len: parse_opt_int(split[100])?,
264            // ssu_silva_blast_bitscore: parse_opt_int(split[101])?,
265            // ssu_silva_blast_evalue: parse_opt_int(split[102])?,
266            // ssu_silva_blast_perc_identity: parse_opt_int(split[103])?,
267            // ssu_silva_blast_subject_id: parse_opt_string(split[104]),
268            // ssu_silva_taxonomy: parse_opt_string(split[105]),
269            // total_gap_length: parse_int(split[106])?,
270            // trna_aa_count: parse_int(split[107])?,
271            // trna_count: parse_int(split[108])?,
272            // trna_selenocysteine_count: parse_int(split[109])?,
273        };
274        Ok(out)
275    }
276}
277
278
279/// The GTDB R214 metadata file.
280pub struct GtdbMetadataR214 {
281    pub rows: HashMap<GenomeId, GtdbMetadataR214Row>,
282}
283
284impl GtdbMetadataR214 {
285    pub fn from_bufreader<T: std::io::Read>(buf: BufReader<T>) -> ProleResult<Self> {
286        let mut out: HashMap<GenomeId, GtdbMetadataR214Row> = HashMap::new();
287        for line in buf.lines() {
288            let line = line.map_err(ProleError::IoError)?;
289            if line.starts_with("accession\tambiguous_bases") || line.is_empty() {
290                continue;
291            }
292            let row = GtdbMetadataR214Row::from_string(&line)?;
293            out.insert(row.accession.clone(), row);
294        }
295        Ok(Self {
296            rows: out
297        })
298    }
299
300    pub fn from_path(path: &Path) -> ProleResult<Self> {
301        let file = File::open(path).map_err(ProleError::IoError)?;
302        let reader = BufReader::new(file);
303        Self::from_bufreader(reader)
304    }
305}
306
307
#[cfg(test)]
mod tests {
    use super::*;

    /// A real 110-column record; column 15 (`gtdb_representative`) is `f`.
    const ROW: &str = "RS_GCF_000246985.2\t44\t99.5\t0.5\t299\tp__Euryarchaeota (UID4)\t202\t0\t2014456\t90.93903317665627\t1\t954455\t43.08802922449628\t2215172\tRS_GCF_024054535.1\tf\td__Archaea;p__Methanobacteriota_B;c__Thermococci;o__Thermococcales;f__Thermococcaceae;g__Thermococcus_A;s__Thermococcus_A alcaliphilus\ttype strain of species\tLPSN\tf\t1\t1\t2215172\t2215172\t2215172\t1\t3020\tNC_022084.1\t2215172\t2\t103\tNC_022084.1\t3020\t5561\t0\t99.901\tAKID01000054.18410.21433\tArchaea;Euryarchaeota;Thermococci;Thermococcales;Thermococcaceae;Thermococcus;Thermococcus sp. PK\t2215172\t2215172\tt\tf\tf\t2215172\t2215172\tComplete Genome\tASM24698v3\tna\tPRJNA224116\tSAMN02603679\tnone\tnone\tnone\t2013-08-13\tGCA_000246985.3\tnone\tfull\tnone\tnone\tnone\t1\t0\tThermococcus litoralis DSM 5473\t2402\trepresentative genome\t4\t1\t1\t2215172\t2215172\t2215172\t2013/08/13\t0\t2265\t1\tDSM 5473\tNew England Biolabs, Inc.\t523849\td__Archaea;p__Euryarchaeota;c__Thermococci;o__Thermococcales;f__Thermococcaceae;g__Thermococcus;s__Thermococcus litoralis\td__Archaea;p__Euryarchaeota;c__Thermococci;o__Thermococcales;f__Thermococcaceae;g__Thermococcus;s__Thermococcus litoralis;x__Thermococcus litoralis DSM 5473\t0\t2215172\t11\t46\tassembly from type material\t2215172\t0\tnone\t2497\t1\t2215172\t1\tnone\tnone\tnone\tnone\tnone\tnone\t1485\tNC_022084.1\t1485\t2743\t0\t100\tCP006670.774259.775759\tArchaea;Euryarchaeota;Thermococci;Thermococcales;Thermococcaceae;Thermococcus;Thermococcus litoralis DSM 5473\t0\t19\t45\t0";

    /// Parses `line`, panicking with a fixed message on failure so the tests
    /// do not require the error type to implement `Debug`.
    fn parse_row(line: &str) -> GtdbMetadataR214Row {
        match GtdbMetadataR214Row::from_string(line) {
            Ok(row) => row,
            Err(_) => panic!("a well-formed 110-column row should parse"),
        }
    }

    #[test]
    fn test_from_string() {
        let row = parse_row(ROW);
        assert_eq!(row.accession.0, "RS_GCF_000246985.2");
        assert!(!row.gtdb_representative);
    }

    #[test]
    fn test_from_string_representative_flag() {
        // Flip column 15 from "f" to "t"; it is the only "f" column directly
        // followed by the "d__" taxonomy prefix.
        let line = ROW.replacen("\tf\td__", "\tt\td__", 1);
        let row = parse_row(&line);
        assert!(row.gtdb_representative);
    }

    #[test]
    fn test_from_string_rejects_wrong_column_count() {
        assert!(GtdbMetadataR214Row::from_string("").is_err());
        assert!(GtdbMetadataR214Row::from_string("a\tb\tc").is_err());
        // One extra trailing column must be rejected as well.
        let too_long = format!("{}\textra", ROW);
        assert!(GtdbMetadataR214Row::from_string(&too_long).is_err());
    }

    #[test]
    fn test_from_bufreader_skips_header_and_blank_lines() {
        let data = format!("accession\tambiguous_bases\tcheckm_completeness\n\n{}\n", ROW);
        let metadata = match GtdbMetadataR214::from_bufreader(BufReader::new(data.as_bytes())) {
            Ok(metadata) => metadata,
            Err(_) => panic!("header and blank lines should be skipped"),
        };
        assert_eq!(metadata.rows.len(), 1);
        assert!(metadata
            .rows
            .contains_key(&GenomeId("RS_GCF_000246985.2".to_string())));
    }

    #[test]
    fn test_from_bufreader_propagates_row_errors() {
        let data = "a\tb\n";
        assert!(GtdbMetadataR214::from_bufreader(BufReader::new(data.as_bytes())).is_err());
    }
}