1use std::collections::hash_map::Entry;
2use std::collections::HashMap;
3use std::ffi::CString;
4use std::io::Write;
5use std::path::PathBuf;
6
7use blart::TreeMap;
8use indicatif::ProgressBar;
9use serde::{Deserialize, Deserializer, Serialize};
10
11pub mod lookup;
13
14pub mod nodes;
16
17pub mod genomehubs;
19
20use crate::error;
21
22use genomehubs::{
23 GHubsConfig, SkipPartial, Source, StringOrVec, ValidationCounts, ValidationStatus,
24};
25use lookup::{
26 clean_name, match_taxonomy_section, Candidate, MatchCounts, MatchStatus, TaxonInfo, TaxonMatch,
27};
28use nodes::{Name, Node, Nodes};
29
30fn add_new_names(
32 taxon: &Candidate,
33 taxon_names: &HashMap<String, String>,
34 names: &mut HashMap<String, Vec<Name>>,
35 id_map: &TreeMap<CString, Vec<TaxonInfo>>,
36 xref_label: &Option<String>,
37) {
38 if taxon.tax_id.is_none() {
39 return;
40 }
41 let tax_id = taxon.tax_id.clone().unwrap();
42 for (name_class, name) in taxon_names.iter() {
43 if name == "None" || name == "NA" || name.is_empty() {
44 continue;
45 }
46 if let Some(tax_info) = id_map.get(&CString::new(clean_name(name)).unwrap()) {
49 let mut found = false;
50 for info in tax_info {
51 if info.tax_id == tax_id {
52 found = true;
53 }
54 }
55 if found {
56 continue;
57 }
58 }
59
60 let unique_name = match xref_label {
61 Some(label) => format!("{}:{}", label, name),
62 None => name.clone(),
63 };
64 let taxon_name = Name {
65 tax_id: tax_id.clone(),
66 name: name.clone(),
67 unique_name,
68 class: Some(name_class.replace('_', " ")),
69 ..Default::default()
70 };
71
72 names.entry(tax_id.clone()).or_default().push(taxon_name);
73 }
74}
75
76fn add_new_taxid(
77 taxon: &TaxonMatch,
78 taxonomy_section: &HashMap<String, String>,
79 _id_map: &TreeMap<CString, Vec<TaxonInfo>>,
80 row_index: Option<usize>,
81 raw_row: Option<String>,
82) -> Option<Node> {
83 let alt_taxon_id;
85 if let Some(alt_id) = taxonomy_section.get("alt_taxon_id") {
86 if alt_id == "None" && alt_id == "NA" {
87 return None;
88 } else {
89 alt_taxon_id = alt_id;
90 }
91 } else {
92 return None;
93 }
94 let mut node = None;
95 if let Some(higher_status) = &taxon.higher_status {
96 if let MatchStatus::PutativeMatch(higher_candidate) = higher_status {
97 node = Some(Node {
99 tax_id: alt_taxon_id.clone(),
100 parent_tax_id: higher_candidate.tax_id.clone().unwrap(),
101 rank: taxon.taxon.rank.clone(),
102 scientific_name: Some(taxon.taxon.name.clone()),
103 names: None,
104 row_index,
105 raw_row,
106 ..Default::default()
107 });
108 }
109 }
110 node
111}
112
/// Reads the data file described by `ghubs_config`, validates each row and
/// tries to assign every row to a taxon in `id_map`.
///
/// Returns a pair of maps keyed by `tax_id`:
///
/// * new names collected for taxa (see `add_new_names`), and
/// * new nodes created for unmatched taxa (only when `create_taxa` is set).
///
/// A validation summary and a taxon assignment summary are printed to stdout
/// once all rows have been processed. When `write_validated` is set, an
/// updated copy of the configuration is written via `write_updated_config`.
///
/// # Errors
///
/// Propagates errors from `init_csv_reader`, `write_processed_row` and
/// `write_modified_row`. Errors on individual records are handed to
/// `ghubs_config.handle_error` and the row is skipped.
///
/// # Panics
///
/// Panics if `ghubs_config.file` is `None` when a partially valid row is
/// seen, if an assigned taxon has no `tax_id`, or if a cleaned genus name
/// cannot be converted to a `CString`.
fn nodes_from_file(
    config_file: &PathBuf,
    ghubs_config: &mut GHubsConfig,
    id_map: &TreeMap<CString, Vec<TaxonInfo>>,
    write_validated: bool,
    create_taxa: bool,
    xref_label: Option<String>,
    skip_tsv: bool,
) -> Result<(HashMap<String, Vec<Name>>, HashMap<String, Node>), error::Error> {
    // Config sections that are validated for every row.
    let keys = vec!["attributes", "taxon_names", "taxonomy"];
    let mut fixed_names = HashMap::new();
    // The reader returned here is discarded; only a failure is propagated.
    // NOTE(review): presumably this call checks the configured headers against
    // the file — confirm against `GHubsConfig::init_csv_reader`.
    ghubs_config.init_csv_reader(Some(keys.clone()), skip_tsv)?;
    ghubs_config.init_file_writers(write_validated, true);
    // Taxon id / name setup is only done when there is a lookup map to match
    // against.
    if !id_map.is_empty() {
        ghubs_config.init_taxon_id();
        fixed_names = ghubs_config.init_taxon_names();
    }

    let mut names = HashMap::new();
    let mut nodes = HashMap::new();

    let mut validation_counts: ValidationCounts = ValidationCounts::default();
    let mut match_counts = MatchCounts::default();

    let pb = ProgressBar::new_spinner();

    for (row_index, result) in ghubs_config
        .init_csv_reader(None, skip_tsv)?
        .records()
        .enumerate()
    {
        // Show the running validation counts next to the spinner.
        pb.set_message(format!("[+] {}", validation_counts.to_jsonl().as_str()));
        pb.inc(1);
        // A record that failed to parse is reported and skipped, not fatal.
        if let Err(err) = result {
            let err: error::Error = err.into();
            ghubs_config.handle_error(&err, row_index);
            continue;
        }
        let record = result?;
        // Tab-joined copy of the row, stored on any node created from it.
        let raw_row = record.iter().collect::<Vec<_>>().join("\t");
        let (mut processed, mut combined_report) =
            ghubs_config.validate_record(&record, row_index, &keys);
        validation_counts.update(&combined_report.counts);
        // Partially valid rows are dropped entirely when the file is
        // configured with `skip_partial: Row`.
        if combined_report.status == ValidationStatus::Partial
            && ghubs_config.file.as_ref().unwrap().skip_partial == Some(SkipPartial::Row)
        {
            continue;
        }

        let taxonomy_section = processed.get(&"taxonomy".to_string());

        // Without taxonomy data or a lookup map there is nothing to match:
        // write the row as processed and move on.
        if taxonomy_section.is_none() || id_map.is_empty() {
            ghubs_config.write_processed_row(&processed)?;
            continue;
        }

        // Make sure the taxonomy section always has a `taxon_id` entry, using
        // the "None" placeholder when the row did not provide one.
        if let Some(tax_section) = taxonomy_section {
            if tax_section.get("taxon_id").is_none() {
                let mut taxon_id_section = tax_section.clone();
                taxon_id_section.insert("taxon_id".to_string(), "None".to_string());
                processed.insert("taxonomy".to_string(), taxon_id_section);
            }
        }
        // Re-borrow after the possible insert above.
        let taxonomy_section = processed.get(&"taxonomy".to_string());
        let taxon_names_section = processed.get(&"taxon_names".to_string());
        let (assigned_taxon, taxon_match) =
            match_taxonomy_section(taxonomy_section.unwrap(), id_map, Some(&fixed_names));
        let taxon_name = taxon_match.taxon.name.clone();
        combined_report.taxon_name = Some(taxon_name.clone());
        if let Some(taxon) = &assigned_taxon {
            // Assigned: collect any new names and write the row with the
            // assigned taxon id filled in.
            match_counts.assigned += 1;
            if let Some(taxon_names) = taxon_names_section {
                add_new_names(taxon, taxon_names, &mut names, id_map, &xref_label);
            }
            ghubs_config.write_modified_row(
                &processed,
                "taxonomy",
                "taxon_id".to_string(),
                taxon.tax_id.clone().unwrap(),
            )?;
        } else {
            match_counts.unassigned += 1;
        }
        // Set when the row matched nothing at all; such rows may get a new
        // node below if `create_taxa` is enabled.
        let mut unmatched = false;
        if let Some(status) = taxon_match.rank_status.as_ref() {
            match status {
                MatchStatus::Match(_) => match_counts.id_match += 1,
                MatchStatus::MergeMatch(_) => match_counts.merge_match += 1,
                MatchStatus::Mismatch(_) => {
                    match_counts.mismatch += 1;
                    combined_report.status = ValidationStatus::Mismatch;
                    combined_report.mismatch.push(taxon_match.clone());
                    validation_counts.mismatch += 1;

                    ghubs_config.write_exception(&combined_report);
                }
                MatchStatus::MultiMatch(_) => {
                    match_counts.multimatch += 1;
                    combined_report.status = ValidationStatus::Multimatch;
                    combined_report.multimatch.push(taxon_match.clone());
                    validation_counts.multimatch += 1;

                    ghubs_config.write_exception(&combined_report);
                }
                MatchStatus::PutativeMatch(_) => {
                    match_counts.putative += 1;

                    // A putative match is only reported as an exception when
                    // it did not lead to an assignment.
                    if assigned_taxon.is_none() {
                        combined_report.status = ValidationStatus::Putative;
                        combined_report.putative.push(taxon_match.clone());
                        validation_counts.putative += 1;

                        ghubs_config.write_exception(&combined_report);
                    }
                }
                MatchStatus::None => {
                    match_counts.none += 1;
                    unmatched = true;
                    combined_report.status = ValidationStatus::Nomatch;
                    validation_counts.nomatch += 1;

                    ghubs_config.write_exception(&combined_report);
                }
            }
        } else if let Some(_options) = &taxon_match.rank_options {
            // No rank status but alternative options exist: report the row as
            // a spellcheck case.
            match_counts.spellcheck += 1;
            validation_counts.spellcheck += 1;
            combined_report.status = ValidationStatus::Spellcheck;
            combined_report.spellcheck.push(taxon_match.clone());
            ghubs_config.write_exception(&combined_report);
        } else {
            // Neither a rank status nor options: treated as no match.
            match_counts.none += 1;
            unmatched = true;
            combined_report.status = ValidationStatus::Nomatch;
            validation_counts.nomatch += 1;

            ghubs_config.write_exception(&combined_report);
        }
        if unmatched && create_taxa {
            let mut parent_tax_id = None;
            let tax_section = taxonomy_section.unwrap();
            // Work out a genus name: the explicit `genus` value if present,
            // otherwise the first word of `species`, otherwise the first word
            // of `subspecies`.
            // NOTE(review): a present-but-empty `genus` yields `None` without
            // falling back to species/subspecies — confirm this is intended.
            let genus_name = if let Some(genus) = tax_section.get("genus") {
                if !genus.is_empty() {
                    Some(genus.clone())
                } else {
                    None
                }
            } else if let Some(species) = tax_section.get("species") {
                species.split_whitespace().next().map(|s| s.to_string())
            } else if let Some(subspecies) = tax_section.get("subspecies") {
                subspecies.split_whitespace().next().map(|s| s.to_string())
            } else {
                None
            };

            if let Some(ref genus) = genus_name {
                // Use the first taxon listed for the genus name in the lookup
                // map, if any.
                let genus_tax_id = if let Some(genus_infos) =
                    id_map.get(&CString::new(clean_name(genus)).unwrap())
                {
                    genus_infos.first().map(|info| info.tax_id.clone())
                } else {
                    None
                };
                if let Some(gtid) = genus_tax_id {
                    parent_tax_id = Some(gtid);
                } else {
                    // Unknown genus: create a placeholder genus node with an
                    // `anc_<genus>` id, attached to the matched higher taxon
                    // or to "1" when there is none.
                    let genus_tax_id = format!("anc_{}", genus);
                    let genus_parent_tax_id = match &taxon_match.higher_status {
                        Some(MatchStatus::Match(parent))
                        | Some(MatchStatus::MergeMatch(parent))
                        | Some(MatchStatus::PutativeMatch(parent)) => {
                            parent.tax_id.clone().unwrap_or_else(|| "1".to_string())
                        }
                        _ => "1".to_string(),
                    };
                    let genus_node = Node {
                        tax_id: genus_tax_id.clone(),
                        parent_tax_id: genus_parent_tax_id,
                        rank: "genus".to_string(),
                        scientific_name: Some(genus.clone()),
                        names: Some(vec![Name {
                            tax_id: genus_tax_id.clone(),
                            name: genus.clone(),
                            unique_name: genus.clone(),
                            class: Some("scientific name".to_string()),
                            ..Default::default()
                        }]),
                        row_index: Some(row_index),
                        raw_row: Some(raw_row.clone()),
                        ..Default::default()
                    };
                    // An earlier placeholder with the same id is replaced.
                    nodes.insert(genus_tax_id.clone(), genus_node);
                    parent_tax_id = Some(genus_tax_id);
                }
            }

            // `add_new_taxid` only creates a node below a putative higher
            // match, so present the resolved genus as one.
            let mut new_taxon_match = taxon_match.clone();
            if let Some(ref parent_id) = parent_tax_id {
                new_taxon_match.higher_status = Some(MatchStatus::PutativeMatch(Candidate {
                    tax_id: Some(parent_id.clone()),
                    rank: "genus".to_string(),
                    name: genus_name.clone().unwrap_or_default(),
                    anc_ids: None,
                }));
            }

            if let Some(node) = add_new_taxid(
                &new_taxon_match,
                tax_section,
                id_map,
                Some(row_index),
                Some(raw_row.clone()),
            ) {
                nodes.insert(node.tax_id.clone(), node.clone());
                if let Some(taxon_names) = taxon_names_section {
                    add_new_names(
                        &Candidate {
                            tax_id: Some(node.tax_id.clone()),
                            ..Default::default()
                        },
                        taxon_names,
                        &mut names,
                        id_map,
                        &xref_label,
                    );
                }
                // Write the row with the newly created taxon id filled in.
                ghubs_config.write_modified_row(
                    &processed,
                    "taxonomy",
                    "taxon_id".to_string(),
                    node.tax_id.clone(),
                )?;
            }
        }
    }
    pb.finish_with_message("done".to_string());
    println!("Validation Report: {}", validation_counts.to_jsonl());
    if write_validated {
        write_updated_config(config_file, ghubs_config, keys);
    }

    println!("Taxon Assignment Report: {}", match_counts.to_jsonl());
    Ok((names, nodes))
}
373
374fn write_updated_config(config_file: &PathBuf, ghubs_config: &mut GHubsConfig, keys: Vec<&str>) {
375 let mut new_config_file = config_file.clone();
376 let config_file_name = config_file.file_name().unwrap().to_str().unwrap();
378 new_config_file.pop();
379 new_config_file.push("validated");
380 std::fs::create_dir_all(&new_config_file).unwrap();
381 new_config_file.push(config_file_name);
382 for key in keys.iter() {
383 if ghubs_config.get(key).is_some() {
384 for (field, value) in ghubs_config.get_mut(key).unwrap().iter_mut() {
385 value.header = Some(StringOrVec::Single(field.clone()));
386 }
387 }
388 }
389
390 let mut file = std::fs::File::create(&new_config_file).unwrap();
391 file.write_all(serde_yaml::to_string(&ghubs_config).unwrap().as_bytes())
393 .unwrap();
394}
395
396pub fn parse_file(
397 config_file: PathBuf,
398 id_map: &TreeMap<CString, Vec<TaxonInfo>>,
399 write_validated: bool,
400 create_taxa: bool,
401 xref_label: Option<String>,
402 skip_tsv: bool,
403) -> Result<(Nodes, HashMap<String, Vec<Name>>, Source), error::Error> {
404 let mut ghubs_config = GHubsConfig::new(&config_file)?;
407 let (names, tmp_nodes) = nodes_from_file(
409 &config_file,
410 &mut ghubs_config,
411 id_map,
412 write_validated,
413 create_taxa,
414 xref_label.clone(),
415 skip_tsv,
416 )?;
417 let mut nodes = Nodes {
418 nodes: HashMap::new(),
419 children: HashMap::new(),
420 };
421 let source = Source::new(&ghubs_config);
422 for (tax_id, node) in tmp_nodes.iter() {
423 let mut node = node.clone();
424 let unique_name = match &xref_label {
425 Some(label) => format!(
426 "{}:{}",
427 label,
428 node.scientific_name.clone().unwrap_or_default()
429 ),
430 None => String::new(),
431 };
432 let name = Name {
433 tax_id: tax_id.clone(),
434 name: node.scientific_name.clone().unwrap(),
435 unique_name,
436 class: Some("scientific name".to_string()),
437 ..Default::default()
438 };
439 if let Some(taxon_names) = names.get(tax_id) {
440 let mut all_names = taxon_names.clone();
441 all_names.push(name);
442 node.names = Some(all_names);
443 } else {
444 node.names = Some(vec![name]);
445 }
446 let parent = node.parent_tax_id.clone();
447 let child = node.tax_id();
448 if parent != child {
449 match nodes.children.entry(parent) {
450 Entry::Vacant(e) => {
451 e.insert(vec![child]);
452 }
453 Entry::Occupied(mut e) => {
454 e.get_mut().push(child);
455 }
456 }
457 }
458 nodes.nodes.insert(tax_id.clone(), node);
459 }
460
461 Ok((nodes, names, source))
467}
468
469fn lineage_deserialize<'de, D>(deserializer: D) -> Result<Vec<String>, D::Error>
471where
472 D: Deserializer<'de>,
473{
474 let str_sequence = String::deserialize(deserializer)?;
475 Ok(str_sequence
476 .split(';')
477 .map(|item| item.trim().to_owned())
478 .collect())
479}
480
/// A taxon record with its identifier, name, rank and lineage.
///
/// NOTE(review): the name and the camelCase field renames suggest this mirrors
/// an ENA taxonomy API record — confirm against the code that deserialises it.
#[derive(Default, Serialize, Deserialize, Clone, Debug)]
pub struct EnaTaxon {
    /// Taxon identifier; (de)serialised under the key `taxId`.
    #[serde(rename = "taxId")]
    pub tax_id: String,
    /// Scientific name; (de)serialised under the key `scientificName`.
    #[serde(rename = "scientificName")]
    pub scientific_name: String,
    /// Taxonomic rank.
    pub rank: String,
    /// Lineage entries, deserialised from a single `;`-separated string by
    /// `lineage_deserialize`.
    #[serde(deserialize_with = "lineage_deserialize")]
    pub lineage: Vec<String>,
}
496
// Unit tests for the `Name` and `Node` line parsers.
//
// NOTE(review): both parsers are expected to leave "\t|" unconsumed, which
// suggests the input literals below are tab-delimited — confirm that the
// separators inside the string literals are the intended characters.
#[cfg(test)]
mod tests {
    use super::*;

    // A single names line parses into tax id, name and name class, with the
    // remaining fields left at their defaults.
    #[test]
    fn test_parse_name() {
        assert_eq!(
            Name::parse("1 | all | | synonym |", &None).unwrap(),
            (
                "\t|",
                Name {
                    tax_id: String::from("1"),
                    name: String::from("all"),
                    class: Some(String::from("synonym")),
                    ..Default::default()
                }
            )
        );
    }

    // Node lines parse into tax id, parent id and rank, and always produce
    // thirteen columns.
    #[test]
    fn test_parse_node() {
        // Short line: only the first three columns are given, the other ten
        // are expected to be empty strings.
        assert_eq!(
            Node::parse("1 | 1 | no rank |").unwrap(),
            (
                "\t|",
                Node {
                    tax_id: String::from("1"),
                    parent_tax_id: String::from("1"),
                    rank: String::from("no rank"),
                    columns: vec![
                        "1".to_string(),
                        "1".to_string(),
                        "no rank".to_string(),
                        "".to_string(),
                        "".to_string(),
                        "".to_string(),
                        "".to_string(),
                        "".to_string(),
                        "".to_string(),
                        "".to_string(),
                        "".to_string(),
                        "".to_string(),
                        "".to_string()
                    ],
                    names: None,
                    scientific_name: None,
                    row_index: None,
                    raw_row: None,
                }
            )
        );
        // Full line: all thirteen columns are given and kept in order.
        assert_eq!(
            Node::parse("2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | |")
                .unwrap(),
            (
                "\t|",
                Node {
                    tax_id: String::from("2"),
                    parent_tax_id: String::from("131567"),
                    rank: String::from("superkingdom"),
                    columns: vec![
                        "2".to_string(),
                        "131567".to_string(),
                        "superkingdom".to_string(),
                        "".to_string(),
                        "0".to_string(),
                        "0".to_string(),
                        "11".to_string(),
                        "0".to_string(),
                        "0".to_string(),
                        "0".to_string(),
                        "0".to_string(),
                        "0".to_string(),
                        "".to_string()
                    ],
                    names: None,
                    scientific_name: None,
                    row_index: None,
                    raw_row: None,
                }
            )
        );
    }
}
581}