grepq/inverted.rs
1// MIT License
2
3// Copyright (c) 2024 - present Nicholas D. Crosbie
4
5// Permission is hereby granted, free of charge, to any person obtaining a copy
6// of this software and associated documentation files (the "Software"), to deal
7// in the Software without restriction, including without limitation the rights
8// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9// copies of the Software, and to permit persons to whom the Software is
10// furnished to do so, subject to the following conditions:
11
12// The above copyright notice and this permission notice shall be included in all
13// copies or substantial portions of the Software.
14
15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21// SOFTWARE.
22
23// Main functionality for the inverted command.
24// This command filters FASTQ records using a variety of criteria (sequence length, quality, header, regex)
25// and either counts matching records or outputs them in one of several formats.
26use crate::arg::Cli;
27use crate::initialise::{create_reader, create_writer, parse_patterns_file};
28use crate::output::{write_full_record, write_record_with_fasta, write_record_with_id};
29use regex::bytes::Regex;
30use seq_io::fastq::Record;
31use seq_io::parallel::parallel_fastq;
32use std::io::Write;
33
34// Main function to run the inverted command
35pub fn run_inverted(cli: &Cli) {
36 // Extract CLI options for output mode.
37 let with_id = cli.with_id;
38 let with_full_record = cli.with_full_record;
39 let with_fasta = cli.with_fasta;
40 let count = cli.count;
41
42 // Parse the patterns file to extract regex patterns and optional filter parameters.
43 let (regex_set, header_regex, minimum_sequence_length, minimum_quality, quality_encoding, _, _) =
44 parse_patterns_file(&cli.patterns)
45 .map_err(std::io::Error::other)
46 .unwrap();
47
48 // Compile the optional header regex.
49 let header_regex = header_regex.map(|re| {
50 // Compile header regex filter.
51 Regex::new(&re).unwrap()
52 });
53
54 // Create input reader and output writer based on CLI flags.
55 let reader = create_reader(cli);
56 let mut writer = create_writer(cli);
57
58 // Determine which filters to apply.
59 let check_seq_len = minimum_sequence_length.is_some();
60 let check_qual = minimum_quality.is_some();
61 let check_header = header_regex.is_some();
62
63 // Initialize buffers to reuse memory.
64 let mut seq_buffer = Vec::new();
65 let mut qual_buffer = Vec::new();
66 let mut head_buffer = Vec::new();
67
68 if count {
69 // Count mode: Only count records that match the filter criteria.
70 let mut match_count = 0;
71 parallel_fastq(
72 reader,
73 num_cpus::get() as u32,
74 num_cpus::get(),
75 |record, found| {
76 // Worker thread: Apply filters on each record.
77 *found = false;
78 let seq_len_check = !check_seq_len
79 || record.seq().len() >= minimum_sequence_length.unwrap() as usize;
80 let qual_check = !check_qual
81 || crate::quality::average_quality(
82 record.qual(),
83 quality_encoding.as_deref().unwrap_or("Phred+33"),
84 ) >= minimum_quality.unwrap();
85 let header_check =
86 !check_header || header_regex.as_ref().unwrap().is_match(record.head());
87 let regex_check = !regex_set.is_match(record.seq());
88
89 // Mark record as matching if all conditions are met.
90 if seq_len_check && qual_check && header_check && regex_check {
91 *found = true;
92 }
93 },
94 |_, found| {
95 // Main thread: Increment count based on the worker's flag.
96 if *found {
97 match_count += 1;
98 }
99 None::<()>
100 },
101 )
102 .unwrap();
103 // Output the count.
104 writeln!(writer, "{}", match_count).unwrap();
105 } else {
106 // Record output mode: Write records based on the selected output format.
107 parallel_fastq(
108 reader,
109 num_cpus::get() as u32,
110 num_cpus::get(),
111 |record, found| {
112 // Worker thread: Check filter criteria.
113 *found = false;
114 let seq_len_check = !check_seq_len
115 || record.seq().len() >= minimum_sequence_length.unwrap() as usize;
116 let qual_check = !check_qual
117 || crate::quality::average_quality(
118 record.qual(),
119 quality_encoding.as_deref().unwrap_or("Phred+33"),
120 ) >= minimum_quality.unwrap();
121 let header_check =
122 !check_header || header_regex.as_ref().unwrap().is_match(record.head());
123 let regex_check = !regex_set.is_match(record.seq());
124
125 if seq_len_check && qual_check && header_check && regex_check {
126 *found = true;
127 }
128 },
129 |record, found| {
130 // Main thread: Write the record in the appropriate format if it passed the filters.
131 if *found {
132 if with_id {
133 // Output only the record ID.
134 write_record_with_id(
135 &mut writer,
136 &record,
137 &mut head_buffer,
138 &mut seq_buffer,
139 );
140 } else if with_full_record {
141 // Output the full FASTQ record.
142 write_full_record(
143 &mut writer,
144 &record,
145 &mut head_buffer,
146 &mut seq_buffer,
147 &mut qual_buffer,
148 );
149 } else if with_fasta {
150 // Output in FASTA format.
151 write_record_with_fasta(
152 &mut writer,
153 &record,
154 &mut head_buffer,
155 &mut seq_buffer,
156 );
157 } else {
158 // Default: output the raw sequence followed by a newline.
159 writer.write_all(record.seq()).unwrap();
160 writer.write_all(b"\n").unwrap();
161 }
162 }
163 None::<()>
164 },
165 )
166 .unwrap();
167 }
168}