Skip to main content

grepq/
inverted.rs

1// MIT License
2
3// Copyright (c) 2024 - present Nicholas D. Crosbie
4
5// Permission is hereby granted, free of charge, to any person obtaining a copy
6// of this software and associated documentation files (the "Software"), to deal
7// in the Software without restriction, including without limitation the rights
8// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9// copies of the Software, and to permit persons to whom the Software is
10// furnished to do so, subject to the following conditions:
11
12// The above copyright notice and this permission notice shall be included in all
13// copies or substantial portions of the Software.
14
15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21// SOFTWARE.
22
23// Main functionality for the inverted command.
24// This command filters FASTQ records using a variety of criteria (sequence length, quality, header, regex)
25// and either counts matching records or outputs them in one of several formats.
26use crate::arg::Cli;
27use crate::initialise::{create_reader, create_writer, parse_patterns_file};
28use crate::output::{write_full_record, write_record_with_fasta, write_record_with_id};
29use regex::bytes::Regex;
30use seq_io::fastq::Record;
31use seq_io::parallel::parallel_fastq;
32use std::io::Write;
33
34// Main function to run the inverted command
35pub fn run_inverted(cli: &Cli) {
36    // Extract CLI options for output mode.
37    let with_id = cli.with_id;
38    let with_full_record = cli.with_full_record;
39    let with_fasta = cli.with_fasta;
40    let count = cli.count;
41
42    // Parse the patterns file to extract regex patterns and optional filter parameters.
43    let (regex_set, header_regex, minimum_sequence_length, minimum_quality, quality_encoding, _, _) =
44        parse_patterns_file(&cli.patterns)
45            .map_err(std::io::Error::other)
46            .unwrap();
47
48    // Compile the optional header regex.
49    let header_regex = header_regex.map(|re| {
50        // Compile header regex filter.
51        Regex::new(&re).unwrap()
52    });
53
54    // Create input reader and output writer based on CLI flags.
55    let reader = create_reader(cli);
56    let mut writer = create_writer(cli);
57
58    // Determine which filters to apply.
59    let check_seq_len = minimum_sequence_length.is_some();
60    let check_qual = minimum_quality.is_some();
61    let check_header = header_regex.is_some();
62
63    // Initialize buffers to reuse memory.
64    let mut seq_buffer = Vec::new();
65    let mut qual_buffer = Vec::new();
66    let mut head_buffer = Vec::new();
67
68    if count {
69        // Count mode: Only count records that match the filter criteria.
70        let mut match_count = 0;
71        parallel_fastq(
72            reader,
73            num_cpus::get() as u32,
74            num_cpus::get(),
75            |record, found| {
76                // Worker thread: Apply filters on each record.
77                *found = false;
78                let seq_len_check = !check_seq_len
79                    || record.seq().len() >= minimum_sequence_length.unwrap() as usize;
80                let qual_check = !check_qual
81                    || crate::quality::average_quality(
82                        record.qual(),
83                        quality_encoding.as_deref().unwrap_or("Phred+33"),
84                    ) >= minimum_quality.unwrap();
85                let header_check =
86                    !check_header || header_regex.as_ref().unwrap().is_match(record.head());
87                let regex_check = !regex_set.is_match(record.seq());
88
89                // Mark record as matching if all conditions are met.
90                if seq_len_check && qual_check && header_check && regex_check {
91                    *found = true;
92                }
93            },
94            |_, found| {
95                // Main thread: Increment count based on the worker's flag.
96                if *found {
97                    match_count += 1;
98                }
99                None::<()>
100            },
101        )
102        .unwrap();
103        // Output the count.
104        writeln!(writer, "{}", match_count).unwrap();
105    } else {
106        // Record output mode: Write records based on the selected output format.
107        parallel_fastq(
108            reader,
109            num_cpus::get() as u32,
110            num_cpus::get(),
111            |record, found| {
112                // Worker thread: Check filter criteria.
113                *found = false;
114                let seq_len_check = !check_seq_len
115                    || record.seq().len() >= minimum_sequence_length.unwrap() as usize;
116                let qual_check = !check_qual
117                    || crate::quality::average_quality(
118                        record.qual(),
119                        quality_encoding.as_deref().unwrap_or("Phred+33"),
120                    ) >= minimum_quality.unwrap();
121                let header_check =
122                    !check_header || header_regex.as_ref().unwrap().is_match(record.head());
123                let regex_check = !regex_set.is_match(record.seq());
124
125                if seq_len_check && qual_check && header_check && regex_check {
126                    *found = true;
127                }
128            },
129            |record, found| {
130                // Main thread: Write the record in the appropriate format if it passed the filters.
131                if *found {
132                    if with_id {
133                        // Output only the record ID.
134                        write_record_with_id(
135                            &mut writer,
136                            &record,
137                            &mut head_buffer,
138                            &mut seq_buffer,
139                        );
140                    } else if with_full_record {
141                        // Output the full FASTQ record.
142                        write_full_record(
143                            &mut writer,
144                            &record,
145                            &mut head_buffer,
146                            &mut seq_buffer,
147                            &mut qual_buffer,
148                        );
149                    } else if with_fasta {
150                        // Output in FASTA format.
151                        write_record_with_fasta(
152                            &mut writer,
153                            &record,
154                            &mut head_buffer,
155                            &mut seq_buffer,
156                        );
157                    } else {
158                        // Default: output the raw sequence followed by a newline.
159                        writer.write_all(record.seq()).unwrap();
160                        writer.write_all(b"\n").unwrap();
161                    }
162                }
163                None::<()>
164            },
165        )
166        .unwrap();
167    }
168}