lorikeet_genome/reference/reference_reader_utils.rs
1use bio::io::fasta::IndexedReader;
2use glob::glob;
3use needletail::parse_fastx_file;
4use std::process::{Stdio, exit, self};
5use std::collections::HashMap;
6use std::fs::File;
7use std::io::BufRead;
8use std::path::Path;
9use tempfile::NamedTempFile;
10
11use crate::external_command_checker;
12use crate::bam_parsing::mapping_index_maintenance::generate_concatenated_fasta_file;
13use crate::utils::utils::find_first;
14
15// lazy_static! {
16// static ref GALAH_COMMAND_DEFINITION: GalahClustererCommandDefinition = {
17// GalahClustererCommandDefinition {
18// dereplication_ani_argument: "ani".to_string(),
19// dereplication_prethreshold_ani_argument: "precluster-ani".to_string(),
20// dereplication_quality_formula_argument: "quality-formula".to_string(),
21// dereplication_precluster_method_argument: "precluster-method".to_string(),
22// dereplication_aligned_fraction_argument: "min-aligned-fraction".to_string(),
23// dereplication_fraglen_argument: "fragment-length".to_string(),
24// dereplication_output_cluster_definition_file: "output-cluster-definition".to_string(),
25// dereplication_output_representative_fasta_directory:
26// "output-representative-fasta-directory".to_string(),
27// dereplication_output_representative_fasta_directory_copy:
28// "output-representative-fasta-directory-copy".to_string(),
29// dereplication_output_representative_list: "output-representative-list".to_string(),
30// }
31// };
32// }
33
34pub struct ReferenceReaderUtils {}
35
36impl ReferenceReaderUtils {
37 pub fn retrieve_reference(concatenated_genomes: &Option<String>) -> IndexedReader<File> {
38 let reference = match concatenated_genomes {
39 Some(reference_path) => match IndexedReader::from_file(&reference_path) {
40 Ok(reader) => reader,
41 Err(_e) => Self::generate_faidx(reference_path.as_str()),
42 },
43 None => panic!("Concatenated reference file does not exist"),
44 };
45
46 reference
47 }
48
49 pub fn extract_genome<'a>(tid: u32, target_names: &'a Vec<&[u8]>, split_char: u8) -> &'a [u8] {
50 let target_name = target_names[tid as usize];
51 trace!("target name {:?}, separator {:?}", target_name, split_char);
52 let offset = find_first(target_name, split_char).unwrap_or_else(|_|
53 panic!("Contig name {} does not contain split symbol, so cannot determine which genome it belongs to",
54 std::str::from_utf8(target_name).unwrap()));
55 return &target_name[0..offset];
56 }
57
58 // Splits a contig name based on the ~
59 pub fn split_contig_name(target_name: &Vec<u8>) -> String {
60 String::from_utf8(target_name.clone())
61 .unwrap()
62 .splitn(2, '~')
63 .nth(1)
64 .unwrap_or_else(|| std::str::from_utf8(&target_name).unwrap())
65 .to_string()
66 }
67
68 pub fn setup_genome_fasta_files(
69 m: &clap::ArgMatches,
70 ) -> (Option<NamedTempFile>, Option<GenomesAndContigs>) {
71 let genome_fasta_files_opt = {
72 match bird_tool_utils::clap_utils::parse_list_of_genome_fasta_files(m, false) {
73 Ok(paths) => {
74 if paths.len() == 0 {
75 error!("Genome paths were described, but ultimately none were found");
76 exit(1);
77 }
78 // if m.is_present("checkm-tab-table") || m.is_present("genome-info") {
79 // let genomes_after_filtering =
80 // galah::cluster_argument_parsing::filter_genomes_through_checkm(
81 // &paths,
82 // m,
83 // &Self::galah_command_line_definition(),
84 // )
85 // .expect("Error parsing CheckM-related options");
86 // info!(
87 // "After filtering by CheckM, {} genomes remained",
88 // genomes_after_filtering.len()
89 // );
90 // if genomes_after_filtering.len() == 0 {
91 // error!("All genomes were filtered out, so none remain to be mapped to");
92 // exit(1);
93 // }
94 // Some(
95 // genomes_after_filtering
96 // .iter()
97 // .map(|s| s.to_string())
98 // .collect(),
99 // )
100 // } else {
101 Some(paths)
102 // }
103 }
104 Err(_) => None,
105 }
106 };
107
108 // debug!("Found paths {:?}", &genome_fasta_files_opt);
109
110 let (concatenated_genomes, genomes_and_contigs_option) = match m.contains_id("genome-fasta-files") {
111 true => match genome_fasta_files_opt {
112 Some(genome_paths) => (
113 Some(
114 generate_concatenated_fasta_file(
115 &genome_paths,
116 ),
117 ),
118 Self::extract_genomes_and_contigs_option(
119 m,
120 &genome_paths.iter().map(|s| s.as_str()).collect(),
121 ),
122 ),
123 None => (None, None),
124 },
125 false => {
126 // Dereplicate if required
127 // TODO: Properly implement dereplication in cli.rs and make sure function works
128 let dereplicated_genomes: Vec<String> = if m.contains_id("dereplicate") {
129 // Self::dereplicate(m, &genome_fasta_files_opt.unwrap())
130 genome_fasta_files_opt.unwrap()
131 } else {
132 genome_fasta_files_opt.unwrap()
133 };
134 // debug!("Profiling {} genomes", dereplicated_genomes.len());
135
136 let list_of_genome_fasta_files = &dereplicated_genomes;
137
138 (
139 Some(
140 generate_concatenated_fasta_file(
141 list_of_genome_fasta_files,
142 ),
143 ),
144 Self::extract_genomes_and_contigs_option(
145 m,
146 &dereplicated_genomes
147 // .clone()
148 .iter()
149 .map(|s| s.as_str())
150 .collect(),
151 ),
152 )
153 }
154 };
155
156 // debug!("Found genome_and_contigs {:?}", &genomes_and_contigs_option);
157 return (concatenated_genomes, genomes_and_contigs_option);
158 }
159
160 pub fn parse_references(m: &clap::ArgMatches) -> Vec<String> {
161 let references = match m.get_many::<String>("genome-fasta-files") {
162 Some(vec) => {
163 let reference_paths = vec.map(|p| p.to_string()).collect::<Vec<String>>();
164 // debug!("Reference files {:?}", reference_paths);
165 reference_paths
166 }
167 None => match m.get_one::<String>("genome-fasta-directory") {
168 Some(path) => {
169 let ext = m.get_one::<String>("genome-fasta-extension").unwrap();
170 let reference_glob = format!("{}/*.{}", path, ext);
171 let reference_paths = glob(&reference_glob)
172 .expect("Failed to read cache")
173 .map(|p| {
174 p.expect("Failed to read cached bam path")
175 .to_str()
176 .unwrap()
177 .to_string()
178 })
179 .collect::<Vec<String>>();
180 // debug!("Reference files {:?}", reference_paths);
181 reference_paths
182 }
183 None => panic!("Can't find suitable references for variant calling"),
184 },
185 };
186 return references;
187 }
188
189 pub fn extract_genomes_and_contigs_option(
190 m: &clap::ArgMatches,
191 genome_fasta_files: &Vec<&str>,
192 ) -> Option<GenomesAndContigs> {
193 match m.contains_id("genome-definition") {
194 true => Some(read_genome_definition_file(
195 m.get_one::<String>("genome-definition").unwrap(),
196 )),
197 false => Some(read_genome_fasta_files(
198 &genome_fasta_files,
199 false,
200 )),
201 }
202 }
203
204 pub fn generate_faidx(reference_path: &str) -> IndexedReader<File> {
205 // debug!("Generating reference index");
206 let cmd_string = format!(
207 "set -e -o pipefail; \
208 samtools faidx {}",
209 reference_path
210 );
211 // debug!("Queuing cmd_string: {}", cmd_string);
212 // check if {reference_path}.fai exists
213 let fai_path = format!("{}.fai", reference_path);
214 if Path::new(&fai_path).exists() {
215 // debug!("Found existing index file at {}", fai_path);
216 return IndexedReader::from_file(&reference_path).expect("Unable to generate index");
217 }
218
219 external_command_checker::check_for_samtools();
220 std::process::Command::new("bash")
221 .arg("-c")
222 .arg(&cmd_string)
223 .stdout(Stdio::piped())
224 .output()
225 .expect("Unable to execute bash");
226
227 IndexedReader::from_file(&reference_path).expect("Unable to generate index")
228 }
229
230 // pub fn galah_command_line_definition() -> &'static GalahClustererCommandDefinition {
231 // &*GALAH_COMMAND_DEFINITION
232 // }
233
234 // pub fn dereplicate(m: &clap::ArgMatches, genome_fasta_files: &Vec<String>) -> Vec<String> {
235 // info!(
236 // "Found {} genomes specified before dereplication",
237 // genome_fasta_files.len()
238 // );
239
240 // // Generate clusterer and check for dependencies
241 // let clusterer = galah::cluster_argument_parsing::generate_galah_clusterer(
242 // genome_fasta_files,
243 // &m,
244 // Self::galah_command_line_definition(),
245 // )
246 // .expect("Failed to parse galah clustering arguments correctly");
247 // galah::external_command_checker::check_for_dependencies();
248 // info!("Dereplicating genome at {}% ANI ..", clusterer.ani * 100.);
249
250 // let cluster_indices = clusterer.cluster();
251 // info!(
252 // "Finished dereplication, finding {} representative genomes.",
253 // cluster_indices.len()
254 // );
255 // debug!("Found cluster indices: {:?}", cluster_indices);
256 // let reps = cluster_indices
257 // .iter()
258 // .map(|cluster| genome_fasta_files[cluster[0]].clone())
259 // .collect::<Vec<_>>();
260 // debug!("Found cluster representatives: {:?}", reps);
261
262 // if m.is_present("output-dereplication-clusters") {
263 // let path = m.value_of("output-dereplication-clusters").unwrap();
264 // info!("Writing dereplication cluster memberships to {}", path);
265 // let mut f = std::fs::File::create(path)
266 // .expect("Error creating dereplication cluster output file");
267 // for cluster in cluster_indices.iter() {
268 // let rep = cluster[0];
269 // for member in cluster {
270 // writeln!(
271 // f,
272 // "{}\t{}",
273 // genome_fasta_files[rep], genome_fasta_files[*member]
274 // )
275 // .expect("Failed to write a specific line to dereplication cluster file");
276 // }
277 // }
278 // }
279 // reps
280 // }
281}
282
283#[derive(Debug, Clone)]
284pub struct GenomesAndContigs {
285 pub genomes: Vec<String>,
286 pub contigs: usize,
287}
288
289impl GenomesAndContigs {
290 pub fn new() -> Self {
291 GenomesAndContigs {
292 genomes: Vec::new(),
293 contigs: 0,
294 }
295 }
296
297 pub fn establish_genome(&mut self, genome_name: String) -> usize {
298 let index = self.genomes.len();
299 self.genomes.push(genome_name);
300 return index;
301 }
302
303 pub fn genome_index(&self, genome_name: &str) -> Option<usize> {
304 match find_first(self.genomes.as_slice(), genome_name.to_string()) {
305 Ok(index) => Some(index),
306 Err(_) => {
307 debug!("Genome {} not found in {:?}", genome_name, self.genomes);
308 None
309 },
310 }
311 }
312}
313
314pub fn read_genome_fasta_files(
315 fasta_file_paths: &Vec<&str>,
316 _use_full_sequence_name: bool,
317) -> GenomesAndContigs {
318 let mut contig_to_genome = GenomesAndContigs::new();
319
320 // NOTE: A lot of this code is shared with mapping_index_maintenance.rs#generate_concatenated_fasta_file
321 for file in fasta_file_paths {
322 let path = Path::new(file);
323 let mut reader =
324 parse_fastx_file(path).expect(&format!("Unable to read fasta file {}", file));
325
326 // Remove .gz .bz .xz from file names if present
327 let genome_name1 =
328 String::from(path.to_str().expect("File name string conversion problem"));
329 // if genome_name1 ends with .gz, .bz, or .xz, the Rust HTSLIB indexed reader
330 // will not be able to parse it. So we need to gracefully exit and inform the user
331 // that they need to decompress the file first. Additionally, if they have provided BAM
332 // files the BAM files will need to be regenerated with decompressed input or reheadered
333 // to remove the .gz, .bz, or .xz extension.
334 if genome_name1.ends_with(".gz") || genome_name1.ends_with(".bz") || genome_name1.ends_with(".xz") {
335 error!("The genome file {} is compressed. Please decompress the file before running lorikeet.", genome_name1);
336 error!("If you are using BAM files, please regenerate the BAM files with decompressed input");
337 error!("Or reheader the BAM files to remove the .gz, .bz, or .xz extension from genome names");
338 process::exit(1);
339 }
340 let path1 = Path::new(&genome_name1);
341
342 let genome_name = String::from(
343 path1
344 .file_stem()
345 .expect("Problem while determining file stem")
346 .to_str()
347 .expect("File name string conversion problem"),
348 );
349 if contig_to_genome.genome_index(&genome_name).is_some() {
350 error!("The genome name {} was derived from >1 file", genome_name);
351 exit(1);
352 }
353 let _genome_index = contig_to_genome.establish_genome(genome_name);
354 while let Some(record) = reader.next() {
355 let record_expected =
356 record.expect(&format!("Failed to parse record in fasta file {:?}", path));
357
358 if record_expected.format() != needletail::parser::Format::Fasta {
359 panic!(
360 "File {:?} is not a fasta file, but a {:?}",
361 path,
362 record_expected.format()
363 );
364 }
365
366 contig_to_genome.contigs += 1;
367 }
368 }
369 return contig_to_genome;
370}
371
372pub fn read_genome_definition_file(definition_file_path: &str) -> GenomesAndContigs {
373 let f = std::fs::File::open(definition_file_path).expect(&format!(
374 "Unable to find/read genome definition file {}",
375 definition_file_path
376 ));
377 let file = std::io::BufReader::new(&f);
378 let mut genome_to_contig: HashMap<String, Vec<String>> = HashMap::new();
379 // Maintain the same order as the input file.
380 let mut genome_order: Vec<String> = vec![];
381 let mut contig_to_genome = GenomesAndContigs::new();
382 for line_res in file.lines() {
383 let line = line_res.expect("Read error on genome definition file");
384 let v: Vec<&str> = line.split("\t").collect();
385 if v.len() == 2 {
386 let genome = v[0].trim();
387 let contig = v[1]
388 .split_ascii_whitespace()
389 .next()
390 .expect("Failed to split contig name by whitespace in genome definition file");
391 contig_to_genome.contigs += 1;
392
393 if genome_to_contig.contains_key(genome) {
394 genome_to_contig
395 .get_mut(genome)
396 .unwrap()
397 .push(contig.to_string());
398 } else {
399 genome_to_contig.insert(genome.to_string(), vec![contig.to_string()]);
400 genome_order.push(genome.to_string());
401 }
402 } else if v.len() == 0 {
403 continue;
404 } else {
405 error!(
406 "The line \"{}\" in the genome definition file is not a \
407 genome name and contig name separated by a tab",
408 line
409 );
410 exit(1);
411 }
412 }
413
414 info!(
415 "Found {} contigs assigned to {} different genomes from \
416 the genome definition file",
417 contig_to_genome.contigs,
418 genome_to_contig.len()
419 );
420
421 for genome in genome_order {
422 let _contigs = &genome_to_contig[&genome];
423 let _genome_index = contig_to_genome.establish_genome(genome);
424 }
425 return contig_to_genome;
426}