barcode_count/
output.rs

1use anyhow::{anyhow, Result};
2use chrono::{DateTime, Local};
3use num_format::{Locale, ToFormattedString};
4use std::{
5    fs::{File, OpenOptions},
6    io::{stdout, Write},
7    path::Path,
8    sync::{
9        atomic::{AtomicU32, Ordering},
10        Arc, Mutex,
11    },
12};
13
14use ahash::{AHashSet, HashMap, HashMapExt};
15
16use itertools::Itertools;
17
18use crate::{
19    arguments::Args,
20    info::{
21        MaxSeqErrors, Results, ResultsEnrichment, ResultsHashmap, SequenceErrors, SequenceFormat,
22    },
23};
24
/// Selects which set of counts an output pass works on.
///
/// The same row-writing routine is reused for the full barcode counts and for
/// the single/double enrichment counts; this enum tells it which hashmap to read.
#[derive(Debug, PartialEq, Eq, Clone)]
enum EnrichedType {
    /// Counts held in the single-barcode enrichment hashmap
    Single,
    /// Counts held in the double-barcode enrichment hashmap
    Double,
    /// Counts of the complete barcode combination, taken from the main results
    Full,
}
31
32/// A struct setup to output results and stat information into files
33pub struct WriteFiles {
34    results: Results,
35    results_enriched: ResultsEnrichment,
36    sequence_format: SequenceFormat,
37    counted_barcodes_hash: Vec<HashMap<String, String>>,
38    samples_barcode_hash: HashMap<String, String>,
39    compounds_written: AHashSet<String>,
40    args: Args,
41    output_files: Vec<String>,
42    output_counts: Vec<usize>,
43    merged_count: usize,
44    merge_text: String,
45    sample_text: String,
46}
47
48impl WriteFiles {
49    pub fn new(
50        results_arc: Arc<Mutex<Results>>,
51        sequence_format: SequenceFormat,
52        counted_barcodes_hash: Vec<HashMap<String, String>>,
53        samples_barcode_hash: HashMap<String, String>,
54        args: Args,
55    ) -> Result<Self> {
56        let results = Arc::try_unwrap(results_arc).unwrap().into_inner().unwrap();
57        Ok(WriteFiles {
58            results,
59            results_enriched: ResultsEnrichment::new(),
60            sequence_format,
61            counted_barcodes_hash,
62            samples_barcode_hash,
63            compounds_written: AHashSet::new(),
64            args,
65            output_files: Vec::new(),
66            output_counts: Vec::new(),
67            merged_count: 0,
68            merge_text: String::new(),
69            sample_text: String::new(),
70        })
71    }
72
73    /// Sets up and writes the results file.  Works for either with or without a random barcode
74    pub fn write_counts_files(&mut self) -> Result<()> {
75        let unknown_sample = "barcode".to_string();
76        // Pull all sample IDs from either random hashmap or counts hashmap
77        let mut sample_barcodes = match &self.results.results_hashmap {
78            ResultsHashmap::RandomBarcode(random_hashmap) => {
79                random_hashmap.keys().cloned().collect::<Vec<String>>()
80            }
81            ResultsHashmap::NoRandomBarcode(count_hashmap) => {
82                count_hashmap.keys().cloned().collect::<Vec<String>>()
83            }
84        };
85
86        if self.args.enrich {
87            self.results_enriched.add_sample_barcodes(&sample_barcodes);
88        }
89
90        // If there was a sample conversion file, sort the barcodes by the sample IDs so that the columns for the merged file are in order
91        if !self.samples_barcode_hash.is_empty() {
92            sample_barcodes.sort_by_key(|barcode| {
93                self.samples_barcode_hash
94                    .get(barcode)
95                    .unwrap_or(&unknown_sample)
96            })
97        }
98
99        // create the directory variable to join the file to
100        let output_dir = self.args.output_dir.clone();
101        let directory = Path::new(&output_dir);
102
103        let mut header = self.create_header();
104        // If merged called, create the header with the sample names as columns and write
105        if self.args.merge_output {
106            if sample_barcodes.len() == 1 {
107                eprintln!("Merged file cannot be created without multiple sample barcodes");
108                println!();
109                self.args.merge_output = false;
110            } else {
111                // Create the merge file and push the header
112                let mut merged_header = header.clone();
113                for sample_barcode in &sample_barcodes {
114                    let sample_name = if self.samples_barcode_hash.is_empty() {
115                        sample_barcode
116                    } else {
117                        // Get the sample name from the sample barcode
118                        self
119                            .samples_barcode_hash
120                            .get(sample_barcode)
121                            .unwrap_or(&unknown_sample)
122                    };
123                    merged_header.push(',');
124                    merged_header.push_str(sample_name);
125                }
126                merged_header.push('\n');
127                self.merge_text.push_str(&merged_header);
128            }
129        }
130
131        // Crate the header to be used with each sample file.  This is just Barcode_1..Barcode_n and Count
132        header.push_str(",Count\n");
133
134        // For each sample, write the counts file
135        for sample_barcode in &sample_barcodes {
136            let sample_name = if !self.samples_barcode_hash.is_empty() {
137                self
138                    .samples_barcode_hash
139                    .get(sample_barcode)
140                    .unwrap_or(&unknown_sample)
141            } else {
142                sample_barcode
143            };
144            let file_name = format!("{}_{}_counts.csv", self.args.prefix, sample_name);
145            println!("{}", file_name);
146            self.output_files.push(file_name.clone());
147            // join the filename with the directory to create the full path
148            let output_path = directory.join(file_name);
149
150            self.sample_text.push_str(&header);
151            let count =
152                self.add_counts_string(sample_barcode, &sample_barcodes, EnrichedType::Full)?;
153
154            let mut output = File::create(output_path)?; // Create the output file
155            output.write_all(self.sample_text.as_bytes())?;
156            self.sample_text.clear();
157            self.output_counts.push(count);
158        }
159        if self.args.merge_output {
160            let merged_file_name = format!("{}{}", self.args.prefix, "_counts.all.csv");
161            println!("{}", merged_file_name);
162            println!(
163                "Barcodes counted: {}",
164                self.merged_count.to_formatted_string(&Locale::en)
165            );
166            self.output_files.push(merged_file_name.clone());
167            let merged_output_path = directory.join(merged_file_name);
168            let mut merged_output_file = File::create(merged_output_path)?;
169            merged_output_file.write_all(self.merge_text.as_bytes())?;
170            self.merge_text.clear();
171            self.output_counts.insert(0, self.merged_count);
172            self.merged_count = 0;
173        }
174        if self.args.enrich {
175            self.write_enriched_files(EnrichedType::Single)?;
176            if self.sequence_format.barcode_num > 2 {
177                self.write_enriched_files(EnrichedType::Double)?;
178            }
179        }
180        Ok(())
181    }
182
183    /// Creates the file header string for column headers
184    fn create_header(&self) -> String {
185        // Create a comma separated header.  First columns are the barcodes, 'Barcode_#'.  The last header is 'Count'
186        let mut header = String::new();
187        if self.sequence_format.barcode_num > 1 {
188            header = "Barcode_1".to_string();
189            for num in 1..self.sequence_format.barcode_num {
190                header.push_str(&format!(",Barcode_{}", num + 1))
191            }
192        } else {
193            header.push_str("Barcode")
194        }
195        header
196    }
197
198    /// Writes the files for when a random barcode is not included
199    fn add_counts_string(
200        &mut self,
201        sample_barcode: &str,
202        sample_barcodes: &[String],
203        enrichment: EnrichedType, // In order to make this non redundant with writing single and double barcodes, this enum determines some aspects
204    ) -> Result<usize> {
205        let mut hash_holder: HashMap<String, HashMap<String, usize>> = HashMap::new(); // a hodler hash to hold the hashmap from sample_counts_hash for a longer lifetime.  Also used later
206                                                                                       // Select from the hashmap connected the the EnrichedType
207        let codes = match enrichment {
208            EnrichedType::Single => {
209                hash_holder = self.results_enriched.single_hashmap.clone();
210                hash_holder
211                    .get(sample_barcode)
212                    .unwrap()
213                    .keys()
214                    .cloned()
215                    .collect::<Vec<String>>()
216            }
217            EnrichedType::Double => {
218                hash_holder = self.results_enriched.double_hashmap.clone();
219                hash_holder
220                    .get(sample_barcode)
221                    .unwrap()
222                    .keys()
223                    .cloned()
224                    .collect::<Vec<String>>()
225            }
226            EnrichedType::Full => match &self.results.results_hashmap {
227                ResultsHashmap::NoRandomBarcode(count_hashmap) => count_hashmap
228                    .get(sample_barcode)
229                    .unwrap()
230                    .keys()
231                    .cloned()
232                    .collect::<Vec<String>>(),
233                ResultsHashmap::RandomBarcode(random_hashmap) => random_hashmap
234                    .get(sample_barcode)
235                    .unwrap()
236                    .keys()
237                    .cloned()
238                    .collect::<Vec<String>>(),
239            },
240        };
241
242        let mut barcode_num = 0;
243        for (line_num, code) in codes.iter().enumerate() {
244            let count = match enrichment {
245                EnrichedType::Single => *self
246                    .results_enriched
247                    .single_hashmap
248                    .get(sample_barcode)
249                    .unwrap()
250                    .get(code)
251                    .unwrap(),
252                EnrichedType::Double => *self
253                    .results_enriched
254                    .double_hashmap
255                    .get(sample_barcode)
256                    .unwrap()
257                    .get(code)
258                    .unwrap(),
259                EnrichedType::Full => match &self.results.results_hashmap {
260                    ResultsHashmap::NoRandomBarcode(count_hashmap) => *count_hashmap
261                        .get(sample_barcode)
262                        .unwrap()
263                        .get(code)
264                        .unwrap(),
265                    ResultsHashmap::RandomBarcode(random_hashmap) => random_hashmap
266                        .get(sample_barcode)
267                        .unwrap()
268                        .get(code)
269                        .unwrap()
270                        .len(),
271                },
272            };
273            barcode_num = line_num + 1;
274            // Print the number counted so far ever 50,000 writes
275            if barcode_num % 50000 == 0 {
276                print!(
277                    "Barcodes counted: {}\r",
278                    barcode_num.to_formatted_string(&Locale::en)
279                );
280                stdout().flush()?;
281            }
282            let written_barcodes = if enrichment == EnrichedType::Full && !self.counted_barcodes_hash.is_empty() {
283                // Convert the building block DNA barcodes and join them back to comma separated
284                convert_code(code, &self.counted_barcodes_hash)
285            } else {
286                code.to_string()
287            };
288
289            // If merge output argument is called, pull data for the compound and write to merged file
290            if self.args.merge_output {
291                // If the compound has not already been written to the file proceed.  This will happen after the first sample is completed
292                let new = self.compounds_written.insert(code.to_string());
293                if new {
294                    self.merged_count += 1;
295                    // Start a new row with the converted building block barcodes
296                    let mut merged_row = written_barcodes.clone();
297                    // For every sample, retrieve the count and add to the row with a comma
298                    for sample_barcode in sample_barcodes {
299                        merged_row.push(',');
300                        // Get teh sample count from the hashmap that corresponds to the EnrichedType.  For single and double, it is the holding hashmap created earlier
301                        let sample_count = match enrichment {
302                            EnrichedType::Single => hash_holder
303                                .get(sample_barcode)
304                                .unwrap()
305                                .get(code)
306                                .unwrap_or(&0)
307                                .to_string(),
308
309                            EnrichedType::Double => hash_holder
310                                .get(sample_barcode)
311                                .unwrap()
312                                .get(code)
313                                .unwrap_or(&0)
314                                .to_string(),
315
316                            EnrichedType::Full => match &self.results.results_hashmap {
317                                ResultsHashmap::RandomBarcode(random_hashmap) => random_hashmap
318                                    .get(sample_barcode)
319                                    .unwrap()
320                                    .get(code)
321                                    .unwrap_or(&AHashSet::new())
322                                    .len()
323                                    .to_string(),
324                                ResultsHashmap::NoRandomBarcode(count_hashmap) => count_hashmap
325                                    .get(sample_barcode)
326                                    .unwrap()
327                                    .get(code)
328                                    .unwrap_or(&0)
329                                    .to_string(),
330                            },
331                        };
332                        merged_row.push_str(&sample_count);
333                    }
334                    merged_row.push('\n');
335                    // write to the merged file
336                    self.merge_text.push_str(&merged_row);
337                }
338            }
339            // Create the row for the sample file and write
340            let row = format!("{},{}\n", written_barcodes, count);
341            self.sample_text.push_str(&row);
342            // If enrichment type is Full, which is neither single nor double for adding string,
343            // and enrich is called.  Add 1 and 2 synthon enrichment.  This is becuase this smae
344            // method is called to create the 1 and 2 synthon strings, and therefore should only
345            // run when Full is used
346            if enrichment == EnrichedType::Full && self.args.enrich {
347                self.results_enriched
348                    .add_single(sample_barcode, &written_barcodes, count);
349                if self.sequence_format.barcode_num > 2 {
350                    self.results_enriched
351                        .add_double(sample_barcode, &written_barcodes, count);
352                }
353            }
354        }
355        print!(
356            "Barcodes counted: {}\r",
357            barcode_num.to_formatted_string(&Locale::en)
358        );
359        println!();
360        Ok(barcode_num)
361    }
362
363    /// Write enriched files for either single or double barcodes if either flag is called
364    fn write_enriched_files(&mut self, enrichment: EnrichedType) -> Result<()> {
365        let unknown_sample = "barcode".to_string();
366        // Pull all sample IDs from either single or double hashmap, which was added to in either random or counts write
367        let mut sample_barcodes = match enrichment {
368            EnrichedType::Single => self
369                .results_enriched
370                .single_hashmap
371                .keys()
372                .cloned()
373                .collect::<Vec<String>>(),
374            EnrichedType::Double => self
375                .results_enriched
376                .double_hashmap
377                .keys()
378                .cloned()
379                .collect::<Vec<String>>(),
380            EnrichedType::Full => {
381                return Err(anyhow!(
382                    "Does not work with Full enrichment type.  Only Single and Double"
383                ))
384            }
385        };
386
387        // If there was a sample conversion file, sort the barcodes by the sample IDs so that the columns for the merged file are in order
388        if !self.samples_barcode_hash.is_empty() {
389            sample_barcodes.sort_by_key(|barcode| {
390                self.samples_barcode_hash
391                    .get(barcode)
392                    .unwrap_or(&unknown_sample)
393            })
394        }
395
396        // Create a descriptor for output file names
397        let descriptor = match enrichment {
398            EnrichedType::Single => "Single",
399            EnrichedType::Double => "Double",
400            EnrichedType::Full => {
401                return Err(anyhow!(
402                    "Does not work with Full enrichment type.  Only Single and Double"
403                ))
404            }
405        };
406
407        // create the directory variable to join the file to
408        let output_dir = self.args.output_dir.clone();
409        let directory = Path::new(&output_dir);
410
411        let mut header = self.create_header();
412        // If merged called, create the header with the sample names as columns and write
413        if self.args.merge_output {
414            let mut merged_header = header.clone();
415            for sample_barcode in &sample_barcodes {
416                let sample_name = if self.samples_barcode_hash.is_empty() {
417                    sample_barcode
418                } else {
419                    // Get the sample name from the sample barcode
420                    self
421                        .samples_barcode_hash
422                        .get(sample_barcode)
423                        .unwrap_or(&unknown_sample)
424                };
425                merged_header.push(',');
426                merged_header.push_str(sample_name);
427            }
428            merged_header.push('\n');
429            self.merge_text.push_str(&merged_header);
430        }
431
432        // Crate the header to be used with each sample file.  This is just Barcode_1..Barcode_n and Count
433        header.push_str(",Count\n");
434
435        // For each sample, write the enriched file
436        for sample_barcode in &sample_barcodes {
437            // Create the file_name with the single or double descriptor
438            let sample_name = if !self.samples_barcode_hash.is_empty() {
439                self
440                    .samples_barcode_hash
441                    .get(sample_barcode)
442                    .unwrap_or(&unknown_sample)
443            } else {
444                sample_barcode
445            };
446            let file_name = format!(
447                "{}_{}_counts.{}.csv",
448                self.args.prefix, sample_name, descriptor
449            );
450            println!("{}", file_name);
451            self.output_files.push(file_name.clone());
452            // join the filename with the directory to create the full path
453            let output_path = directory.join(file_name);
454
455            self.sample_text.push_str(&header);
456            let count =
457                self.add_counts_string(sample_barcode, &sample_barcodes, enrichment.clone())?;
458            let mut output = File::create(output_path)?; // Create the output file
459            output.write_all(self.sample_text.as_bytes())?;
460            self.sample_text.clear();
461            // add the counts to output to stats later
462            self.output_counts.push(count);
463        }
464        // Add the count of merged barcodes if the flag is called
465        if self.args.merge_output {
466            // Create the merge file and push the header, if merged called within arguments
467            let merged_file_name = format!("{}_counts.all.{}.csv", self.args.prefix, descriptor);
468            println!("{}", merged_file_name);
469            self.output_files.push(merged_file_name.clone());
470            let merged_output_path = directory.join(merged_file_name);
471            let mut merged_output_file = File::create(merged_output_path)?;
472            merged_output_file.write_all(self.merge_text.as_bytes())?;
473            println!(
474                "Barcodes counted: {}",
475                self.merged_count.to_formatted_string(&Locale::en)
476            );
477            self.merge_text.clear();
478            self.output_counts.insert(
479                self.output_counts.len() - sample_barcodes.len(),
480                self.merged_count,
481            );
482            self.merged_count = 0;
483        }
484        Ok(())
485    }
486
487    /// Appends the stats information for record keeping
488    pub fn write_stats_file(
489        &self,
490        start_time: DateTime<Local>,
491        max_sequence_errors: MaxSeqErrors,
492        seq_errors: SequenceErrors,
493        total_reads: Arc<AtomicU32>,
494        sequence_format: SequenceFormat,
495    ) -> Result<()> {
496        // Create the stat file name
497        let output_dir = self.args.output_dir.clone();
498        let directory = Path::new(&output_dir);
499        let stat_filename = directory.join(format!("{}_barcode_stats.txt", self.args.prefix));
500        // Make the stat file and make it an appending function
501        let mut stat_file = OpenOptions::new()
502            .write(true)
503            .append(true)
504            .create(true)
505            .open(stat_filename)?;
506
507        // Get the total time the program took to run
508        let now = Local::now();
509        let elapsed_time = now - start_time;
510        // Write the time information to the stat file
511        stat_file.write_all(
512            format!(
513                "-TIME INFORMATION-\nStart: {}\nFinish: {}\nTotal time: {} hours, {} minutes, {}.{} seconds\n\n",
514                start_time.format("%Y-%m-%d %H:%M:%S"),
515                now.format("%Y-%m-%d %H:%M:%S"),
516                elapsed_time.num_hours(),
517                elapsed_time.num_minutes() % 60,
518                elapsed_time.num_seconds() % 60,
519                millisecond_decimal(elapsed_time)
520            )
521            .as_bytes(),
522        )?;
523        // Write the input file information
524        stat_file.write_all(
525            format!(
526                "-INPUT FILES-\nFastq: {}\nFormat: {}\nSamples: {}\nBarcodes: {}\n\n",
527                self.args.fastq,
528                self.args.format,
529                self.args
530                    .sample_barcodes_option
531                    .as_ref()
532                    .unwrap_or(&"None".to_string()),
533                self.args
534                    .counted_barcodes_option
535                    .as_ref()
536                    .unwrap_or(&"None".to_string())
537            )
538            .as_bytes(),
539        )?;
540        // Record the sequence_format
541        stat_file.write_all(format!("{}\n\n", sequence_format).as_bytes())?;
542        // Record the barcode information
543        stat_file.write_all(format!("{}\n", max_sequence_errors).as_bytes())?;
544        // Record the total reads and errors
545        stat_file.write_all(
546            format!(
547                "-RESULTS-\nTotal sequences:             {}\n{}\n\n",
548                total_reads
549                    .load(Ordering::Relaxed)
550                    .to_formatted_string(&Locale::en),
551                seq_errors
552            )
553            .as_bytes(),
554        )?;
555        // Record the files that were created
556        stat_file.write_all("-OUTPUT FILES-\n".as_bytes())?;
557        for (file_name, counts) in self.output_files.iter().zip(self.output_counts.iter()) {
558            stat_file.write_all(
559                format!(
560                    "File & barcodes counted: {}\t{}\n",
561                    file_name,
562                    counts.to_formatted_string(&Locale::en)
563                )
564                .as_bytes(),
565            )?;
566        }
567        stat_file.write_all("\n".as_bytes())?;
568        if self.args.fastq.ends_with("gz") && total_reads.load(Ordering::Relaxed) < 1_000_000 {
569            let warning = "WARNING: The program may have stopped early with the gzipped file.  Unzip the fastq.gz and rerun the algorithm on the unzipped fastq file if the number of reads is expected to be above 1,000,000 ";
570            println!("\n{}\n", warning);
571            stat_file.write_all(format!("\n{}\n", warning).as_bytes())?;
572        }
573        // Close the writing with dashes so that it is separated from the next analysis if it is done on the same day
574        stat_file.write_all("--------------------------------------------------------------------------------------------------\n\n\n".as_bytes())?;
575        Ok(())
576    }
577}
578
579pub fn millisecond_decimal(elapsed_time: chrono::Duration) -> String {
580    let milliseconds =
581        (elapsed_time.num_milliseconds() - (elapsed_time.num_seconds() * 1000)).to_string();
582    let mut final_string = String::new();
583    for _ in milliseconds.chars().count()..3 {
584        final_string.push('0');
585    }
586    final_string.push_str(&milliseconds);
587    final_string
588}
589
/// Converts the comma separated DNA barcodes of `code` to their IDs.
///
/// The barcode at position `i` is looked up in `barcodes_hashmap[i]`.  A barcode
/// without a conversion, or at a position without a conversion map, is kept as
/// its DNA sequence instead of panicking, so the row can still be written.
///
/// Returns the converted barcodes joined back together with commas.
fn convert_code(code: &str, barcodes_hashmap: &[HashMap<String, String>]) -> String {
    code.split(',')
        .enumerate()
        .map(|(barcode_index, barcode)| {
            barcodes_hashmap
                .get(barcode_index)
                .and_then(|conversions| conversions.get(barcode))
                .map_or(barcode, |barcode_id| barcode_id.as_str())
        })
        .collect::<Vec<&str>>()
        .join(",")
}
600
/// Returns the sample name stored for `sample_barcode`.
///
/// When `sample_barcodes_hash` has no entry for the barcode, the placeholder
/// name `barcode` is returned.
pub fn convert_sample_barcode(
    sample_barcode: &str,
    sample_barcodes_hash: &HashMap<String, String>,
) -> String {
    sample_barcodes_hash
        .get(sample_barcode)
        .cloned()
        .unwrap_or_else(|| "barcode".to_string())
}