Skip to main content

blobtk/
parse.rs

1use std::collections::hash_map::Entry;
2use std::collections::HashMap;
3use std::ffi::CString;
4use std::io::Write;
5use std::path::PathBuf;
6
7use blart::TreeMap;
8use indicatif::ProgressBar;
9use serde::{Deserialize, Deserializer, Serialize};
10
11/// Functions for name lookup.
12pub mod lookup;
13
14/// Functions for handling names and nodes
15pub mod nodes;
16
17/// Functions for handling GenomeHubs configuration files
18pub mod genomehubs;
19
20use crate::error;
21
22use genomehubs::{
23    GHubsConfig, SkipPartial, Source, StringOrVec, ValidationCounts, ValidationStatus,
24};
25use lookup::{
26    clean_name, match_taxonomy_section, Candidate, MatchCounts, MatchStatus, TaxonInfo, TaxonMatch,
27};
28use nodes::{Name, Node, Nodes};
29
30// Add new names to the taxonomy
31fn add_new_names(
32    taxon: &Candidate,
33    taxon_names: &HashMap<String, String>,
34    names: &mut HashMap<String, Vec<Name>>,
35    id_map: &TreeMap<CString, Vec<TaxonInfo>>,
36    xref_label: &Option<String>,
37) {
38    if taxon.tax_id.is_none() {
39        return;
40    }
41    let tax_id = taxon.tax_id.clone().unwrap();
42    for (name_class, name) in taxon_names.iter() {
43        if name == "None" || name == "NA" || name.is_empty() {
44            continue;
45        }
46        // does name already exist in id_map associated with the same class and taxid?
47        // if so, skip for now
48        if let Some(tax_info) = id_map.get(&CString::new(clean_name(name)).unwrap()) {
49            let mut found = false;
50            for info in tax_info {
51                if info.tax_id == tax_id {
52                    found = true;
53                }
54            }
55            if found {
56                continue;
57            }
58        }
59
60        let unique_name = match xref_label {
61            Some(label) => format!("{}:{}", label, name),
62            None => name.clone(),
63        };
64        let taxon_name = Name {
65            tax_id: tax_id.clone(),
66            name: name.clone(),
67            unique_name,
68            class: Some(name_class.replace('_', " ")),
69            ..Default::default()
70        };
71
72        names.entry(tax_id.clone()).or_default().push(taxon_name);
73    }
74}
75
76fn add_new_taxid(
77    taxon: &TaxonMatch,
78    taxonomy_section: &HashMap<String, String>,
79    _id_map: &TreeMap<CString, Vec<TaxonInfo>>,
80    row_index: Option<usize>,
81    raw_row: Option<String>,
82) -> Option<Node> {
83    // check taxonomy_section has a value for alt_taxon_id that is not None or NA
84    let alt_taxon_id;
85    if let Some(alt_id) = taxonomy_section.get("alt_taxon_id") {
86        if alt_id == "None" && alt_id == "NA" {
87            return None;
88        } else {
89            alt_taxon_id = alt_id;
90        }
91    } else {
92        return None;
93    }
94    let mut node = None;
95    if let Some(higher_status) = &taxon.higher_status {
96        if let MatchStatus::PutativeMatch(higher_candidate) = higher_status {
97            // attach directly to higher taxon for now
98            node = Some(Node {
99                tax_id: alt_taxon_id.clone(),
100                parent_tax_id: higher_candidate.tax_id.clone().unwrap(),
101                rank: taxon.taxon.rank.clone(),
102                scientific_name: Some(taxon.taxon.name.clone()),
103                names: None,
104                row_index,
105                raw_row,
106                ..Default::default()
107            });
108        }
109    }
110    node
111}
112
113// Parse taxa from a GenomeHubs data file
114fn nodes_from_file(
115    config_file: &PathBuf,
116    ghubs_config: &mut GHubsConfig,
117    id_map: &TreeMap<CString, Vec<TaxonInfo>>,
118    write_validated: bool,
119    create_taxa: bool,
120    xref_label: Option<String>,
121    skip_tsv: bool,
122) -> Result<(HashMap<String, Vec<Name>>, HashMap<String, Node>), error::Error> {
123    let keys = vec!["attributes", "taxon_names", "taxonomy"];
124    let mut fixed_names = HashMap::new();
125    ghubs_config.init_csv_reader(Some(keys.clone()), skip_tsv)?;
126    ghubs_config.init_file_writers(write_validated, true);
127    if !id_map.is_empty() {
128        ghubs_config.init_taxon_id();
129        fixed_names = ghubs_config.init_taxon_names();
130    }
131
132    let mut names = HashMap::new();
133    let mut nodes = HashMap::new();
134
135    let mut validation_counts: ValidationCounts = ValidationCounts::default();
136    let mut match_counts = MatchCounts::default();
137
138    let pb = ProgressBar::new_spinner();
139
140    for (row_index, result) in ghubs_config
141        .init_csv_reader(None, skip_tsv)?
142        .records()
143        .enumerate()
144    {
145        pb.set_message(format!("[+] {}", validation_counts.to_jsonl().as_str()));
146        pb.inc(1);
147        if let Err(err) = result {
148            let err: error::Error = err.into();
149            ghubs_config.handle_error(&err, row_index);
150            continue;
151        }
152        let record = result?;
153        let raw_row = record.iter().collect::<Vec<_>>().join("\t");
154        let (mut processed, mut combined_report) =
155            ghubs_config.validate_record(&record, row_index, &keys);
156        validation_counts.update(&combined_report.counts);
157        if combined_report.status == ValidationStatus::Partial
158            && ghubs_config.file.as_ref().unwrap().skip_partial == Some(SkipPartial::Row)
159        {
160            continue;
161        }
162
163        let taxonomy_section = processed.get(&"taxonomy".to_string());
164
165        if taxonomy_section.is_none() || id_map.is_empty() {
166            ghubs_config.write_processed_row(&processed)?;
167            continue;
168        }
169
170        if let Some(tax_section) = taxonomy_section {
171            if tax_section.get("taxon_id").is_none() {
172                let mut taxon_id_section = tax_section.clone();
173                taxon_id_section.insert("taxon_id".to_string(), "None".to_string());
174                // replace taxonomy section with new section
175                processed.insert("taxonomy".to_string(), taxon_id_section);
176            }
177        }
178        let taxonomy_section = processed.get(&"taxonomy".to_string());
179        let taxon_names_section = processed.get(&"taxon_names".to_string());
180        let (assigned_taxon, taxon_match) =
181            match_taxonomy_section(taxonomy_section.unwrap(), id_map, Some(&fixed_names));
182        let taxon_name = taxon_match.taxon.name.clone();
183        // add taxon name to combined report
184        combined_report.taxon_name = Some(taxon_name.clone());
185        if let Some(taxon) = &assigned_taxon {
186            match_counts.assigned += 1;
187            if let Some(taxon_names) = taxon_names_section {
188                add_new_names(taxon, taxon_names, &mut names, id_map, &xref_label);
189            }
190            ghubs_config.write_modified_row(
191                &processed,
192                "taxonomy",
193                "taxon_id".to_string(),
194                taxon.tax_id.clone().unwrap(),
195            )?;
196        } else {
197            match_counts.unassigned += 1;
198        }
199        let mut unmatched = false;
200        if let Some(status) = taxon_match.rank_status.as_ref() {
201            match status {
202                MatchStatus::Match(_) => match_counts.id_match += 1,
203                MatchStatus::MergeMatch(_) => match_counts.merge_match += 1,
204                MatchStatus::Mismatch(_) => {
205                    match_counts.mismatch += 1;
206                    combined_report.status = ValidationStatus::Mismatch;
207                    combined_report.mismatch.push(taxon_match.clone());
208                    validation_counts.mismatch += 1;
209
210                    ghubs_config.write_exception(&combined_report);
211                }
212                MatchStatus::MultiMatch(_) => {
213                    match_counts.multimatch += 1;
214                    combined_report.status = ValidationStatus::Multimatch;
215                    combined_report.multimatch.push(taxon_match.clone());
216                    validation_counts.multimatch += 1;
217
218                    ghubs_config.write_exception(&combined_report);
219                }
220                MatchStatus::PutativeMatch(_) => {
221                    match_counts.putative += 1;
222
223                    if assigned_taxon.is_none() {
224                        combined_report.status = ValidationStatus::Putative;
225                        combined_report.putative.push(taxon_match.clone());
226                        validation_counts.putative += 1;
227
228                        ghubs_config.write_exception(&combined_report);
229                    }
230                }
231                MatchStatus::None => {
232                    match_counts.none += 1;
233                    unmatched = true;
234                    combined_report.status = ValidationStatus::Nomatch;
235                    // combined_report.multimatch.push(taxon_match.clone());
236                    validation_counts.nomatch += 1;
237
238                    ghubs_config.write_exception(&combined_report);
239                }
240            }
241        } else if let Some(_options) = &taxon_match.rank_options {
242            match_counts.spellcheck += 1;
243            validation_counts.spellcheck += 1;
244            combined_report.status = ValidationStatus::Spellcheck;
245            combined_report.spellcheck.push(taxon_match.clone());
246            ghubs_config.write_exception(&combined_report);
247        } else {
248            match_counts.none += 1;
249            unmatched = true;
250            combined_report.status = ValidationStatus::Nomatch;
251            // combined_report.multimatch.push(taxon_match.clone());
252            validation_counts.nomatch += 1;
253
254            ghubs_config.write_exception(&combined_report);
255        }
256        if unmatched && create_taxa {
257            // Find/add parent genus first
258            let mut parent_tax_id = None;
259            let tax_section = taxonomy_section.unwrap();
260            // Try to get genus from taxonomy_section or from species/subspecies name
261            let genus_name = if let Some(genus) = tax_section.get("genus") {
262                if !genus.is_empty() {
263                    Some(genus.clone())
264                } else {
265                    None
266                }
267            } else if let Some(species) = tax_section.get("species") {
268                species.split_whitespace().next().map(|s| s.to_string())
269            } else if let Some(subspecies) = tax_section.get("subspecies") {
270                subspecies.split_whitespace().next().map(|s| s.to_string())
271            } else {
272                None
273            };
274
275            // Try to find or create genus node
276            if let Some(ref genus) = genus_name {
277                // Look up genus in id_map
278                let genus_tax_id = if let Some(genus_infos) =
279                    id_map.get(&CString::new(clean_name(genus)).unwrap())
280                {
281                    // Use first match if available
282                    genus_infos.first().map(|info| info.tax_id.clone())
283                } else {
284                    None
285                };
286                if let Some(gtid) = genus_tax_id {
287                    parent_tax_id = Some(gtid);
288                } else {
289                    // Create new genus node
290                    let genus_tax_id = format!("anc_{}", genus);
291                    // Set parent_tax_id to higher taxon match if available, else root
292                    let genus_parent_tax_id = match &taxon_match.higher_status {
293                        Some(MatchStatus::Match(parent))
294                        | Some(MatchStatus::MergeMatch(parent))
295                        | Some(MatchStatus::PutativeMatch(parent)) => {
296                            parent.tax_id.clone().unwrap_or_else(|| "1".to_string())
297                        }
298                        _ => "1".to_string(),
299                    };
300                    let genus_node = Node {
301                        tax_id: genus_tax_id.clone(),
302                        parent_tax_id: genus_parent_tax_id,
303                        rank: "genus".to_string(),
304                        scientific_name: Some(genus.clone()),
305                        names: Some(vec![Name {
306                            tax_id: genus_tax_id.clone(),
307                            name: genus.clone(),
308                            unique_name: genus.clone(),
309                            class: Some("scientific name".to_string()),
310                            ..Default::default()
311                        }]),
312                        row_index: Some(row_index),
313                        raw_row: Some(raw_row.clone()),
314                        ..Default::default()
315                    };
316                    nodes.insert(genus_tax_id.clone(), genus_node);
317                    parent_tax_id = Some(genus_tax_id);
318                }
319            }
320
321            // Now create the species/subspecies node, using genus as parent if found/created
322            let mut new_taxon_match = taxon_match.clone();
323            if let Some(ref parent_id) = parent_tax_id {
324                // Set higher_status to point to genus
325                new_taxon_match.higher_status = Some(MatchStatus::PutativeMatch(Candidate {
326                    tax_id: Some(parent_id.clone()),
327                    rank: "genus".to_string(),
328                    name: genus_name.clone().unwrap_or_default(),
329                    anc_ids: None,
330                }));
331            }
332
333            if let Some(node) = add_new_taxid(
334                &new_taxon_match,
335                tax_section,
336                id_map,
337                Some(row_index),
338                Some(raw_row.clone()),
339            ) {
340                nodes.insert(node.tax_id.clone(), node.clone());
341                if let Some(taxon_names) = taxon_names_section {
342                    add_new_names(
343                        &Candidate {
344                            tax_id: Some(node.tax_id.clone()),
345                            ..Default::default()
346                        },
347                        taxon_names,
348                        &mut names,
349                        id_map,
350                        &xref_label,
351                    );
352                }
353                ghubs_config.write_modified_row(
354                    &processed,
355                    "taxonomy",
356                    "taxon_id".to_string(),
357                    node.tax_id.clone(),
358                )?;
359                // TODO: add new taxid to id_map and increment counter
360            }
361        }
362    }
363    pb.finish_with_message("done".to_string());
364    println!("Validation Report: {}", validation_counts.to_jsonl());
365    if write_validated {
366        // write ghubs_config back to file in validated directory
367        write_updated_config(config_file, ghubs_config, keys);
368    }
369
370    println!("Taxon Assignment Report: {}", match_counts.to_jsonl());
371    Ok((names, nodes))
372}
373
374fn write_updated_config(config_file: &PathBuf, ghubs_config: &mut GHubsConfig, keys: Vec<&str>) {
375    let mut new_config_file = config_file.clone();
376    // get file name
377    let config_file_name = config_file.file_name().unwrap().to_str().unwrap();
378    new_config_file.pop();
379    new_config_file.push("validated");
380    std::fs::create_dir_all(&new_config_file).unwrap();
381    new_config_file.push(config_file_name);
382    for key in keys.iter() {
383        if ghubs_config.get(key).is_some() {
384            for (field, value) in ghubs_config.get_mut(key).unwrap().iter_mut() {
385                value.header = Some(StringOrVec::Single(field.clone()));
386            }
387        }
388    }
389
390    let mut file = std::fs::File::create(&new_config_file).unwrap();
391    // write ghubs_config YAML to file
392    file.write_all(serde_yaml::to_string(&ghubs_config).unwrap().as_bytes())
393        .unwrap();
394}
395
396pub fn parse_file(
397    config_file: PathBuf,
398    id_map: &TreeMap<CString, Vec<TaxonInfo>>,
399    write_validated: bool,
400    create_taxa: bool,
401    xref_label: Option<String>,
402    skip_tsv: bool,
403) -> Result<(Nodes, HashMap<String, Vec<Name>>, Source), error::Error> {
404    // let mut children = HashMap::new();
405
406    let mut ghubs_config = GHubsConfig::new(&config_file)?;
407    // let source = Source::new(&ghubs_config);
408    let (names, tmp_nodes) = nodes_from_file(
409        &config_file,
410        &mut ghubs_config,
411        id_map,
412        write_validated,
413        create_taxa,
414        xref_label.clone(),
415        skip_tsv,
416    )?;
417    let mut nodes = Nodes {
418        nodes: HashMap::new(),
419        children: HashMap::new(),
420    };
421    let source = Source::new(&ghubs_config);
422    for (tax_id, node) in tmp_nodes.iter() {
423        let mut node = node.clone();
424        let unique_name = match &xref_label {
425            Some(label) => format!(
426                "{}:{}",
427                label,
428                node.scientific_name.clone().unwrap_or_default()
429            ),
430            None => String::new(),
431        };
432        let name = Name {
433            tax_id: tax_id.clone(),
434            name: node.scientific_name.clone().unwrap(),
435            unique_name,
436            class: Some("scientific name".to_string()),
437            ..Default::default()
438        };
439        if let Some(taxon_names) = names.get(tax_id) {
440            let mut all_names = taxon_names.clone();
441            all_names.push(name);
442            node.names = Some(all_names);
443        } else {
444            node.names = Some(vec![name]);
445        }
446        let parent = node.parent_tax_id.clone();
447        let child = node.tax_id();
448        if parent != child {
449            match nodes.children.entry(parent) {
450                Entry::Vacant(e) => {
451                    e.insert(vec![child]);
452                }
453                Entry::Occupied(mut e) => {
454                    e.get_mut().push(child);
455                }
456            }
457        }
458        nodes.nodes.insert(tax_id.clone(), node);
459    }
460
461    // let mut rdr = ReaderBuilder::new()
462    //     .has_headers(false)
463    //     .delimiter(b'\t')
464    //     .from_path(gbif_backbone)?;
465
466    Ok((nodes, names, source))
467}
468
469/// Deserializer for lineage
470fn lineage_deserialize<'de, D>(deserializer: D) -> Result<Vec<String>, D::Error>
471where
472    D: Deserializer<'de>,
473{
474    let str_sequence = String::deserialize(deserializer)?;
475    Ok(str_sequence
476        .split(';')
477        .map(|item| item.trim().to_owned())
478        .collect())
479}
480
/// ENA taxonomy record from taxonomy API
#[derive(Default, Serialize, Deserialize, Clone, Debug)]
pub struct EnaTaxon {
    /// Unique taxon ID (serialized as `taxId`)
    #[serde(rename = "taxId")]
    pub tax_id: String,
    /// Scientific name (serialized as `scientificName`)
    #[serde(rename = "scientificName")]
    pub scientific_name: String,
    /// Taxonomic rank
    pub rank: String,
    /// Lineage, deserialized from a single `;`-separated string into one
    /// trimmed entry per taxon by `lineage_deserialize`
    #[serde(deserialize_with = "lineage_deserialize")]
    pub lineage: Vec<String>,
}
496
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the owned `columns` vector expected on a parsed `Node`.
    fn columns(values: &[&str]) -> Vec<String> {
        values.iter().map(|value| value.to_string()).collect()
    }

    /// A names row parses into a `Name`, leaving the trailing delimiter unconsumed.
    #[test]
    fn test_parse_name() {
        let expected = Name {
            tax_id: "1".to_string(),
            name: "all".to_string(),
            class: Some("synonym".to_string()),
            ..Default::default()
        };
        assert_eq!(
            Name::parse("1\t|\tall\t|\t\t|\tsynonym\t|", &None).unwrap(),
            ("\t|", expected)
        );
    }

    /// Short and full-width nodes rows both parse into a 13-column `Node`.
    #[test]
    fn test_parse_node() {
        // a short row: the missing columns come back as empty strings
        let root = Node {
            tax_id: "1".to_string(),
            parent_tax_id: "1".to_string(),
            rank: "no rank".to_string(),
            columns: columns(&[
                "1", "1", "no rank", "", "", "", "", "", "", "", "", "", "",
            ]),
            names: None,
            scientific_name: None,
            row_index: None,
            raw_row: None,
        };
        assert_eq!(
            Node::parse("1\t|\t1\t|\tno rank\t|").unwrap(),
            ("\t|", root)
        );

        // a full-width row
        let bacteria = Node {
            tax_id: "2".to_string(),
            parent_tax_id: "131567".to_string(),
            rank: "superkingdom".to_string(),
            columns: columns(&[
                "2",
                "131567",
                "superkingdom",
                "",
                "0",
                "0",
                "11",
                "0",
                "0",
                "0",
                "0",
                "0",
                "",
            ]),
            names: None,
            scientific_name: None,
            row_index: None,
            raw_row: None,
        };
        assert_eq!(
            Node::parse(
                "2\t|\t131567\t|\tsuperkingdom\t|\t\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|"
            )
            .unwrap(),
            ("\t|", bacteria)
        );
    }
}