Skip to main content

fastqc_rust/
runner.rs

1use std::io;
2use std::path::{Path, PathBuf};
3use std::sync::atomic::{AtomicBool, Ordering};
4
5use rayon::prelude::*;
6
7use crate::config::FastQCConfig;
8use crate::modules;
9use crate::report;
10use crate::sequence::casava;
11use crate::sequence::open_sequence_file;
12use crate::sequence::{SequenceFile, SequenceFileGroup};
13
/// One logical sample that is pushed through every QC module as a unit.
///
/// In the default mode a group wraps exactly one input path. In CASAVA mode
/// several paths that share a basename are bundled into a single group.
///
/// The common traits are derived so that groups can be debug-printed and
/// compared directly (for example with `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
struct FileGroup {
    /// Name shown in log messages and reports: the CASAVA basename for a
    /// grouped sample, otherwise the original file name.
    name: String,
    /// Paths belonging to this group (usually one; more than one for CASAVA groups).
    files: Vec<PathBuf>,
}
22
23/// Run FastQC analysis on the given input files.
24///
25/// The Java OfflineRunner iterates over files, creates a
26/// SequenceFile reader for each, instantiates all QC modules, feeds every
27/// Sequence to each module, then writes the report. With --threads, files
28/// are processed in parallel via AnalysisQueue.
29pub fn run(config: &FastQCConfig, files: &[PathBuf]) -> Result<(), i32> {
30    let limits = config.load_limits().map_err(|e| {
31        eprintln!("Failed to load limits: {}", e);
32        1
33    })?;
34
35    // Validate all files exist before starting processing.
36    // For stdin, skip the existence check (Java: `filenames[0].startsWith("stdin")`).
37    // For --nano mode, expand directories to find .fast5 files within them.
38    let mut valid_files = Vec::new();
39    let mut something_failed = false;
40    for file_path in files {
41        let file_name = file_path.to_string_lossy();
42        if !file_name.starts_with("stdin") && !file_path.exists() {
43            eprintln!("{} doesn't exist", file_name);
44            something_failed = true;
45        } else if config.nano && file_path.is_dir() {
46            // In --nano mode, directories are recursively searched for .fast5 files.
47            // Matches OfflineRunner.java's directory expansion logic.
48            match find_fast5_files(file_path) {
49                Ok(fast5_files) => {
50                    if fast5_files.is_empty() {
51                        eprintln!("No .fast5 files found in {}", file_path.display());
52                        something_failed = true;
53                    } else {
54                        valid_files.extend(fast5_files);
55                    }
56                }
57                Err(e) => {
58                    eprintln!("Error scanning directory {}: {}", file_path.display(), e);
59                    something_failed = true;
60                }
61            }
62        } else {
63            valid_files.push(file_path.clone());
64        }
65    }
66
67    // Group files based on mode (casava, nano, or individual).
68    // Java's OfflineRunner.java lines 103-117 handles this branching.
69    let file_groups = build_file_groups(config, &valid_files);
70
71    // Build rayon thread pool matching --threads
72    // Java's AnalysisQueue uses a fixed thread pool of size --threads
73    let pool = rayon::ThreadPoolBuilder::new()
74        .num_threads(config.threads)
75        .build()
76        .map_err(|e| {
77            eprintln!("Failed to create thread pool: {}", e);
78            1
79        })?;
80
81    let failed = AtomicBool::new(something_failed);
82
83    pool.install(|| {
84        file_groups.par_iter().for_each(|group| {
85            if !config.quiet {
86                eprintln!("Started analysis of {}", group.name);
87            }
88
89            match process_group(config, &limits, group) {
90                Ok(()) => {
91                    if !config.quiet {
92                        eprintln!("Analysis complete for {}", group.name);
93                    }
94                }
95                Err(e) => {
96                    eprintln!("Failed to process {}: {}", group.name, e);
97                    failed.store(true, Ordering::Relaxed);
98                }
99            }
100        });
101    });
102
103    if failed.load(Ordering::Relaxed) {
104        Err(1)
105    } else {
106        Ok(())
107    }
108}
109
110/// Build file groups based on the current mode (casava, or individual files).
111///
112/// Matches the grouping logic in OfflineRunner.java lines 103-117.
113/// - If `--casava`: group by CASAVA basename
114/// - Otherwise: each file is its own group
115fn build_file_groups(config: &FastQCConfig, files: &[PathBuf]) -> Vec<FileGroup> {
116    if config.casava {
117        // CasavaBasename.getCasavaGroups() groups files by their
118        // extracted basename. Files that don't match the pattern become singletons.
119        let casava_groups = casava::get_casava_groups(files);
120        casava_groups
121            .into_iter()
122            .map(|(name, paths)| FileGroup { name, files: paths })
123            .collect()
124    } else {
125        // Default mode - each file is processed individually.
126        // Java creates `fileGroups = new File[files.size()][1]` with one file per group.
127        files
128            .iter()
129            .map(|path| {
130                let name = path
131                    .file_name()
132                    .map(|n| n.to_string_lossy().into_owned())
133                    .unwrap_or_else(|| path.to_string_lossy().into_owned());
134                FileGroup {
135                    name,
136                    files: vec![path.clone()],
137                }
138            })
139            .collect()
140    }
141}
142
143/// Process a file group (one or more files) through all QC modules and generate reports.
144///
145/// When a group has multiple files (CASAVA), they are combined
146/// into a SequenceFileGroup that reads them sequentially as one logical sample.
147fn process_group(
148    config: &FastQCConfig,
149    limits: &crate::config::Limits,
150    group: &FileGroup,
151) -> io::Result<()> {
152    // Open the sequence file(s)
153    let mut seq_file: Box<dyn SequenceFile> = if group.files.len() == 1 {
154        // Single file - open directly
155        // Uses format detection logic from SequenceFactory.java
156        open_sequence_file(config, &group.files[0])?
157    } else {
158        // Multiple files in a CASAVA group - wrap them in a
159        // SequenceFileGroup that reads all files sequentially as one stream.
160        let mut readers: Vec<Box<dyn SequenceFile>> = Vec::new();
161        for path in &group.files {
162            readers.push(open_sequence_file(config, path)?);
163        }
164        Box::new(SequenceFileGroup::new(group.name.clone(), readers))
165    };
166
167    let file_display_name = group.name.clone();
168
169    // Create module instances
170    let mut modules = modules::create_modules(config, limits);
171
172    // Set the filename on all modules (BasicStats uses it for the report)
173    for module in modules.iter_mut() {
174        module.set_filename(&file_display_name);
175    }
176
177    // Process all sequences through all modules
178    // Matches AnalysisRunner.java:64-126
179    let mut sequence_count: u64 = 0;
180    let mut last_percent: i32 = -1;
181
182    loop {
183        match seq_file.next() {
184            Some(Ok(seq)) => {
185                sequence_count += 1;
186
187                for module in modules.iter_mut() {
188                    // Skip filtered sequences for modules that request it
189                    if seq.is_filtered && module.ignore_filtered_sequences() {
190                        continue;
191                    }
192                    module.process_sequence(&seq);
193                }
194
195                // Progress reporting every 5%
196                if !config.quiet && sequence_count.is_multiple_of(1000) {
197                    let percent = seq_file.percent_complete() as i32;
198                    if percent != last_percent && percent % 5 == 0 {
199                        eprintln!("Approx {}% complete for {}", percent, file_display_name);
200                        last_percent = percent;
201                    }
202                }
203            }
204            Some(Err(e)) => {
205                return Err(io::Error::new(io::ErrorKind::InvalidData, e));
206            }
207            None => break, // EOF
208        }
209    }
210
211    // Finalize all modules (lazy computation)
212    for module in modules.iter_mut() {
213        module.finalize();
214    }
215
216    // Generate output filename
217    // For CASAVA groups, the display name is used as the base for
218    // output files. For single files, it's the filename.
219    // Strip extensions in order: .gz, .bz2, .txt, .fastq, .fq, .csfastq, .sam, .bam, .ubam
220    let base_name = strip_extensions(&file_display_name.replace("stdin:", ""));
221
222    // For output directory, use --outdir if specified, otherwise
223    // use the parent directory of the first file in the group.
224    let output_dir = if let Some(ref dir) = config.output_dir {
225        dir.clone()
226    } else {
227        group
228            .files
229            .first()
230            .and_then(|f| f.parent())
231            .unwrap_or_else(|| Path::new("."))
232            .to_path_buf()
233    };
234
235    // The Java code creates files at:
236    //   {output_dir}/{base_name}_fastqc.html  (standalone HTML)
237    //   {output_dir}/{base_name}_fastqc.zip   (zip archive)
238    let html_path = output_dir.join(format!("{}_fastqc.html", base_name));
239    let zip_path = output_dir.join(format!("{}_fastqc.zip", base_name));
240
241    // Generate HTML report as a string (used for both standalone file and zip entry)
242    let html_content =
243        report::html::generate_html_report(&modules, &file_display_name, config.template)?;
244
245    // Write standalone HTML file
246    // The Java code writes the HTML via PrintWriter after creating the zip
247    std::fs::write(&html_path, &html_content)?;
248
249    // Create zip archive, reusing the already-generated HTML content
250    report::archive::create_zip_archive(
251        &modules,
252        &file_display_name,
253        &base_name,
254        &zip_path,
255        &html_content,
256        config.svg_output,
257        config.template,
258    )?;
259
260    // Handle --extract flag
261    // If do_unzip is true, extract the zip file to the output directory.
262    // If do_unzip is None (not specified), do not extract (matches Java default).
263    if config.do_unzip == Some(true) {
264        report::archive::extract_zip(&zip_path)?;
265
266        // Handle --delete flag (only effective when --extract is also used)
267        // Matches FastQCConfig.delete_after_unzip behavior
268        if config.delete_after_unzip {
269            std::fs::remove_file(&zip_path)?;
270        }
271    }
272
273    Ok(())
274}
275
/// Remove known sequencing-file suffixes from the end of `name`.
///
/// Each suffix in the table is tried exactly once, in table order, against
/// whatever remains after the previous removals. This reproduces the chain of
/// `replaceAll` calls in OfflineRunner.java, so `"x.fastq.gz"` becomes `"x"`
/// while `"x.gz.txt"` only loses `".txt"` (`.gz` was already tried).
fn strip_extensions(name: &str) -> String {
    // Order matters: it is the order in which the suffixes are attempted.
    const KNOWN_SUFFIXES: [&str; 10] = [
        ".gz", ".bz2", ".txt", ".fastq", ".fq", ".csfastq", ".sam", ".bam", ".ubam", ".fast5",
    ];

    let mut stem = name;
    for suffix in KNOWN_SUFFIXES.iter() {
        if let Some(shorter) = stem.strip_suffix(*suffix) {
            stem = shorter;
        }
    }
    stem.to_owned()
}
291
292/// Recursively find all .fast5 files within a directory.
293///
294/// In --nano mode, Java's OfflineRunner recursively searches directories
295/// for .fast5 files to process.
296fn find_fast5_files(dir: &Path) -> io::Result<Vec<PathBuf>> {
297    let mut files = Vec::new();
298    find_fast5_files_recursive(dir, &mut files)?;
299    files.sort(); // Deterministic ordering
300    Ok(files)
301}
302
/// Walk `dir` depth-first and append every `.fast5` file to `files`.
///
/// Sub-directories are descended into. Any other entry is kept only when its
/// extension equals `fast5`, compared ASCII case-insensitively. Entries are
/// appended in directory iteration order; no sorting happens here.
///
/// # Errors
///
/// Stops at, and returns, the first error from reading a directory or one of
/// its entries.
fn find_fast5_files_recursive(dir: &Path, files: &mut Vec<PathBuf>) -> io::Result<()> {
    for dir_entry in std::fs::read_dir(dir)? {
        let candidate = dir_entry?.path();

        if candidate.is_dir() {
            find_fast5_files_recursive(&candidate, files)?;
            continue;
        }

        let is_fast5 = matches!(
            candidate.extension(),
            Some(ext) if ext.eq_ignore_ascii_case("fast5")
        );
        if is_fast5 {
            files.push(candidate);
        }
    }
    Ok(())
}
318
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned list of paths from string literals.
    fn paths(names: &[&str]) -> Vec<PathBuf> {
        names.iter().map(|name| PathBuf::from(*name)).collect()
    }

    /// Known extensions, including stacked ones, are removed from the end.
    #[test]
    fn test_strip_extensions() {
        let cases = [
            ("sample.fastq", "sample"),
            ("sample.fastq.gz", "sample"),
            ("sample.fq.bz2", "sample"),
            ("sample.bam", "sample"),
            ("sample.sam", "sample"),
            ("sample.txt.gz", "sample"),
            ("minimal.fastq", "minimal"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(strip_extensions(input), *expected);
        }
    }

    /// Without --casava every file forms its own group, named after the file.
    #[test]
    fn test_build_file_groups_default() {
        let groups = build_file_groups(
            &FastQCConfig::default(),
            &paths(&["a.fastq", "b.fastq"]),
        );
        let summary: Vec<(&str, usize)> = groups
            .iter()
            .map(|g| (g.name.as_str(), g.files.len()))
            .collect();
        assert_eq!(summary, vec![("a.fastq", 1), ("b.fastq", 1)]);
    }

    /// With --casava, files sharing a CASAVA basename end up in one group.
    #[test]
    fn test_build_file_groups_casava() {
        let config = FastQCConfig {
            casava: true,
            ..FastQCConfig::default()
        };
        let groups = build_file_groups(
            &config,
            &paths(&[
                "Sample_S1_L001_R1_001.fastq.gz",
                "Sample_S1_L001_R1_002.fastq.gz",
                "Other_S2_L001_R1_001.fastq.gz",
            ]),
        );
        assert_eq!(groups.len(), 2);

        // Group order is not asserted, so look each group up by name.
        let files_in = |wanted: &str| -> usize {
            groups
                .iter()
                .find(|g| g.name == wanted)
                .unwrap()
                .files
                .len()
        };
        assert_eq!(files_in("Sample_S1_L001_R1.fastq.gz"), 2);
        assert_eq!(files_in("Other_S2_L001_R1.fastq.gz"), 1);
    }

    /// The literal "stdin" input is grouped like any other single file.
    #[test]
    fn test_build_file_groups_stdin() {
        let groups = build_file_groups(&FastQCConfig::default(), &paths(&["stdin"]));
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].name, "stdin");
    }
}