lorikeet_genome/reference/
reference_reader_utils.rs

1use bio::io::fasta::IndexedReader;
2use glob::glob;
3use needletail::parse_fastx_file;
4use std::process::{Stdio, exit, self};
5use std::collections::HashMap;
6use std::fs::File;
7use std::io::BufRead;
8use std::path::Path;
9use tempfile::NamedTempFile;
10
11use crate::external_command_checker;
12use crate::bam_parsing::mapping_index_maintenance::generate_concatenated_fasta_file;
13use crate::utils::utils::find_first;
14
15// lazy_static! {
16//     static ref GALAH_COMMAND_DEFINITION: GalahClustererCommandDefinition = {
17//         GalahClustererCommandDefinition {
18//             dereplication_ani_argument: "ani".to_string(),
19//             dereplication_prethreshold_ani_argument: "precluster-ani".to_string(),
20//             dereplication_quality_formula_argument: "quality-formula".to_string(),
21//             dereplication_precluster_method_argument: "precluster-method".to_string(),
22//             dereplication_aligned_fraction_argument: "min-aligned-fraction".to_string(),
23//             dereplication_fraglen_argument: "fragment-length".to_string(),
24//             dereplication_output_cluster_definition_file: "output-cluster-definition".to_string(),
25//             dereplication_output_representative_fasta_directory:
26//                 "output-representative-fasta-directory".to_string(),
27//             dereplication_output_representative_fasta_directory_copy:
28//                 "output-representative-fasta-directory-copy".to_string(),
29//             dereplication_output_representative_list: "output-representative-list".to_string(),
30//         }
31//     };
32// }
33
34pub struct ReferenceReaderUtils {}
35
36impl ReferenceReaderUtils {
37    pub fn retrieve_reference(concatenated_genomes: &Option<String>) -> IndexedReader<File> {
38        let reference = match concatenated_genomes {
39            Some(reference_path) => match IndexedReader::from_file(&reference_path) {
40                Ok(reader) => reader,
41                Err(_e) => Self::generate_faidx(reference_path.as_str()),
42            },
43            None => panic!("Concatenated reference file does not exist"),
44        };
45
46        reference
47    }
48
49    pub fn extract_genome<'a>(tid: u32, target_names: &'a Vec<&[u8]>, split_char: u8) -> &'a [u8] {
50        let target_name = target_names[tid as usize];
51        trace!("target name {:?}, separator {:?}", target_name, split_char);
52        let offset = find_first(target_name, split_char).unwrap_or_else(|_|
53            panic!("Contig name {} does not contain split symbol, so cannot determine which genome it belongs to",
54                     std::str::from_utf8(target_name).unwrap()));
55        return &target_name[0..offset];
56    }
57
58    // Splits a contig name based on the ~
59    pub fn split_contig_name(target_name: &Vec<u8>) -> String {
60        String::from_utf8(target_name.clone())
61            .unwrap()
62            .splitn(2, '~')
63            .nth(1)
64            .unwrap_or_else(|| std::str::from_utf8(&target_name).unwrap())
65            .to_string()
66    }
67
68    pub fn setup_genome_fasta_files(
69        m: &clap::ArgMatches,
70    ) -> (Option<NamedTempFile>, Option<GenomesAndContigs>) {
71        let genome_fasta_files_opt = {
72            match bird_tool_utils::clap_utils::parse_list_of_genome_fasta_files(m, false) {
73                Ok(paths) => {
74                    if paths.len() == 0 {
75                        error!("Genome paths were described, but ultimately none were found");
76                        exit(1);
77                    }
78                    // if m.is_present("checkm-tab-table") || m.is_present("genome-info") {
79                    //     let genomes_after_filtering =
80                    //         galah::cluster_argument_parsing::filter_genomes_through_checkm(
81                    //             &paths,
82                    //             m,
83                    //             &Self::galah_command_line_definition(),
84                    //         )
85                    //         .expect("Error parsing CheckM-related options");
86                    //     info!(
87                    //         "After filtering by CheckM, {} genomes remained",
88                    //         genomes_after_filtering.len()
89                    //     );
90                    //     if genomes_after_filtering.len() == 0 {
91                    //         error!("All genomes were filtered out, so none remain to be mapped to");
92                    //         exit(1);
93                    //     }
94                    //     Some(
95                    //         genomes_after_filtering
96                    //             .iter()
97                    //             .map(|s| s.to_string())
98                    //             .collect(),
99                    //     )
100                    // } else {
101                    Some(paths)
102                    // }
103                }
104                Err(_) => None,
105            }
106        };
107
108        // debug!("Found paths {:?}", &genome_fasta_files_opt);
109
110        let (concatenated_genomes, genomes_and_contigs_option) = match m.contains_id("genome-fasta-files") {
111            true => match genome_fasta_files_opt {
112                Some(genome_paths) => (
113                    Some(
114                        generate_concatenated_fasta_file(
115                            &genome_paths,
116                        ),
117                    ),
118                    Self::extract_genomes_and_contigs_option(
119                        m,
120                        &genome_paths.iter().map(|s| s.as_str()).collect(),
121                    ),
122                ),
123                None => (None, None),
124            },
125            false => {
126                // Dereplicate if required
127                // TODO: Properly implement dereplication in cli.rs and make sure function works
128                let dereplicated_genomes: Vec<String> = if m.contains_id("dereplicate") {
129                    // Self::dereplicate(m, &genome_fasta_files_opt.unwrap())
130                    genome_fasta_files_opt.unwrap()
131                } else {
132                    genome_fasta_files_opt.unwrap()
133                };
134                // debug!("Profiling {} genomes", dereplicated_genomes.len());
135
136                let list_of_genome_fasta_files = &dereplicated_genomes;
137
138                (
139                    Some(
140                        generate_concatenated_fasta_file(
141                            list_of_genome_fasta_files,
142                        ),
143                    ),
144                    Self::extract_genomes_and_contigs_option(
145                        m,
146                        &dereplicated_genomes
147                            // .clone()
148                            .iter()
149                            .map(|s| s.as_str())
150                            .collect(),
151                    ),
152                )
153            }
154        };
155
156        // debug!("Found genome_and_contigs {:?}", &genomes_and_contigs_option);
157        return (concatenated_genomes, genomes_and_contigs_option);
158    }
159
160    pub fn parse_references(m: &clap::ArgMatches) -> Vec<String> {
161        let references = match m.get_many::<String>("genome-fasta-files") {
162            Some(vec) => {
163                let reference_paths = vec.map(|p| p.to_string()).collect::<Vec<String>>();
164                // debug!("Reference files {:?}", reference_paths);
165                reference_paths
166            }
167            None => match m.get_one::<String>("genome-fasta-directory") {
168                Some(path) => {
169                    let ext = m.get_one::<String>("genome-fasta-extension").unwrap();
170                    let reference_glob = format!("{}/*.{}", path, ext);
171                    let reference_paths = glob(&reference_glob)
172                        .expect("Failed to read cache")
173                        .map(|p| {
174                            p.expect("Failed to read cached bam path")
175                                .to_str()
176                                .unwrap()
177                                .to_string()
178                        })
179                        .collect::<Vec<String>>();
180                    // debug!("Reference files {:?}", reference_paths);
181                    reference_paths
182                }
183                None => panic!("Can't find suitable references for variant calling"),
184            },
185        };
186        return references;
187    }
188
189    pub fn extract_genomes_and_contigs_option(
190        m: &clap::ArgMatches,
191        genome_fasta_files: &Vec<&str>,
192    ) -> Option<GenomesAndContigs> {
193        match m.contains_id("genome-definition") {
194            true => Some(read_genome_definition_file(
195                m.get_one::<String>("genome-definition").unwrap(),
196            )),
197            false => Some(read_genome_fasta_files(
198                &genome_fasta_files,
199                false,
200            )),
201        }
202    }
203
204    pub fn generate_faidx(reference_path: &str) -> IndexedReader<File> {
205        // debug!("Generating reference index");
206        let cmd_string = format!(
207            "set -e -o pipefail; \
208            samtools faidx {}",
209            reference_path
210        );
211        // debug!("Queuing cmd_string: {}", cmd_string);
212        // check if {reference_path}.fai exists 
213        let fai_path = format!("{}.fai", reference_path);
214        if Path::new(&fai_path).exists() {
215            // debug!("Found existing index file at {}", fai_path);
216            return IndexedReader::from_file(&reference_path).expect("Unable to generate index");
217        }
218                    
219        external_command_checker::check_for_samtools();
220        std::process::Command::new("bash")
221            .arg("-c")
222            .arg(&cmd_string)
223            .stdout(Stdio::piped())
224            .output()
225            .expect("Unable to execute bash");
226
227        IndexedReader::from_file(&reference_path).expect("Unable to generate index")
228    }
229
230    // pub fn galah_command_line_definition() -> &'static GalahClustererCommandDefinition {
231    //     &*GALAH_COMMAND_DEFINITION
232    // }
233
234    // pub fn dereplicate(m: &clap::ArgMatches, genome_fasta_files: &Vec<String>) -> Vec<String> {
235    //     info!(
236    //         "Found {} genomes specified before dereplication",
237    //         genome_fasta_files.len()
238    //     );
239
240    //     // Generate clusterer and check for dependencies
241    //     let clusterer = galah::cluster_argument_parsing::generate_galah_clusterer(
242    //         genome_fasta_files,
243    //         &m,
244    //         Self::galah_command_line_definition(),
245    //     )
246    //     .expect("Failed to parse galah clustering arguments correctly");
247    //     galah::external_command_checker::check_for_dependencies();
248    //     info!("Dereplicating genome at {}% ANI ..", clusterer.ani * 100.);
249
250    //     let cluster_indices = clusterer.cluster();
251    //     info!(
252    //         "Finished dereplication, finding {} representative genomes.",
253    //         cluster_indices.len()
254    //     );
255    //     debug!("Found cluster indices: {:?}", cluster_indices);
256    //     let reps = cluster_indices
257    //         .iter()
258    //         .map(|cluster| genome_fasta_files[cluster[0]].clone())
259    //         .collect::<Vec<_>>();
260    //     debug!("Found cluster representatives: {:?}", reps);
261
262    //     if m.is_present("output-dereplication-clusters") {
263    //         let path = m.value_of("output-dereplication-clusters").unwrap();
264    //         info!("Writing dereplication cluster memberships to {}", path);
265    //         let mut f = std::fs::File::create(path)
266    //             .expect("Error creating dereplication cluster output file");
267    //         for cluster in cluster_indices.iter() {
268    //             let rep = cluster[0];
269    //             for member in cluster {
270    //                 writeln!(
271    //                     f,
272    //                     "{}\t{}",
273    //                     genome_fasta_files[rep], genome_fasta_files[*member]
274    //                 )
275    //                 .expect("Failed to write a specific line to dereplication cluster file");
276    //             }
277    //         }
278    //     }
279    //     reps
280    // }
281}
282
283#[derive(Debug, Clone)]
284pub struct GenomesAndContigs {
285    pub genomes: Vec<String>,
286    pub contigs: usize,
287}
288
289impl GenomesAndContigs {
290    pub fn new() -> Self {
291        GenomesAndContigs {
292            genomes: Vec::new(),
293            contigs: 0,
294        }
295    }
296
297    pub fn establish_genome(&mut self, genome_name: String) -> usize {
298        let index = self.genomes.len();
299        self.genomes.push(genome_name);
300        return index;
301    }
302
303    pub fn genome_index(&self, genome_name: &str) -> Option<usize> {
304        match find_first(self.genomes.as_slice(), genome_name.to_string()) {
305            Ok(index) => Some(index),
306            Err(_) => {
307                debug!("Genome {} not found in {:?}", genome_name, self.genomes);
308                None
309            },
310        }
311    }
312}
313
314pub fn read_genome_fasta_files(
315    fasta_file_paths: &Vec<&str>,
316    _use_full_sequence_name: bool,
317) -> GenomesAndContigs {
318    let mut contig_to_genome = GenomesAndContigs::new();
319
320    // NOTE: A lot of this code is shared with mapping_index_maintenance.rs#generate_concatenated_fasta_file
321    for file in fasta_file_paths {
322        let path = Path::new(file);
323        let mut reader =
324            parse_fastx_file(path).expect(&format!("Unable to read fasta file {}", file));
325
326        // Remove .gz .bz .xz from file names if present
327        let genome_name1 =
328            String::from(path.to_str().expect("File name string conversion problem"));
329        // if genome_name1 ends with .gz, .bz, or .xz, the Rust HTSLIB indexed reader
330        // will not be able to parse it. So we need to gracefully exit and inform the user
331        // that they need to decompress the file first. Additionally, if they have provided BAM
332        // files the BAM files will need to be regenerated with decompressed input or reheadered
333        // to remove the .gz, .bz, or .xz extension.
334        if genome_name1.ends_with(".gz") || genome_name1.ends_with(".bz") || genome_name1.ends_with(".xz") {
335            error!("The genome file {} is compressed. Please decompress the file before running lorikeet.", genome_name1);
336            error!("If you are using BAM files, please regenerate the BAM files with decompressed input");
337            error!("Or reheader the BAM files to remove the .gz, .bz, or .xz extension from genome names");
338            process::exit(1);
339        }
340        let path1 = Path::new(&genome_name1);
341
342        let genome_name = String::from(
343            path1
344                .file_stem()
345                .expect("Problem while determining file stem")
346                .to_str()
347                .expect("File name string conversion problem"),
348        );
349        if contig_to_genome.genome_index(&genome_name).is_some() {
350            error!("The genome name {} was derived from >1 file", genome_name);
351            exit(1);
352        }
353        let _genome_index = contig_to_genome.establish_genome(genome_name);
354        while let Some(record) = reader.next() {
355            let record_expected =
356                record.expect(&format!("Failed to parse record in fasta file {:?}", path));
357
358            if record_expected.format() != needletail::parser::Format::Fasta {
359                panic!(
360                    "File {:?} is not a fasta file, but a {:?}",
361                    path,
362                    record_expected.format()
363                );
364            }
365
366            contig_to_genome.contigs += 1;
367        }
368    }
369    return contig_to_genome;
370}
371
372pub fn read_genome_definition_file(definition_file_path: &str) -> GenomesAndContigs {
373    let f = std::fs::File::open(definition_file_path).expect(&format!(
374        "Unable to find/read genome definition file {}",
375        definition_file_path
376    ));
377    let file = std::io::BufReader::new(&f);
378    let mut genome_to_contig: HashMap<String, Vec<String>> = HashMap::new();
379    // Maintain the same order as the input file.
380    let mut genome_order: Vec<String> = vec![];
381    let mut contig_to_genome = GenomesAndContigs::new();
382    for line_res in file.lines() {
383        let line = line_res.expect("Read error on genome definition file");
384        let v: Vec<&str> = line.split("\t").collect();
385        if v.len() == 2 {
386            let genome = v[0].trim();
387            let contig = v[1]
388                .split_ascii_whitespace()
389                .next()
390                .expect("Failed to split contig name by whitespace in genome definition file");
391            contig_to_genome.contigs += 1;
392
393            if genome_to_contig.contains_key(genome) {
394                genome_to_contig
395                    .get_mut(genome)
396                    .unwrap()
397                    .push(contig.to_string());
398            } else {
399                genome_to_contig.insert(genome.to_string(), vec![contig.to_string()]);
400                genome_order.push(genome.to_string());
401            }
402        } else if v.len() == 0 {
403            continue;
404        } else {
405            error!(
406                "The line \"{}\" in the genome definition file is not a \
407                    genome name and contig name separated by a tab",
408                line
409            );
410            exit(1);
411        }
412    }
413
414    info!(
415        "Found {} contigs assigned to {} different genomes from \
416           the genome definition file",
417        contig_to_genome.contigs,
418        genome_to_contig.len()
419    );
420
421    for genome in genome_order {
422        let _contigs = &genome_to_contig[&genome];
423        let _genome_index = contig_to_genome.establish_genome(genome);
424    }
425    return contig_to_genome;
426}