Skip to main content

fastqc_rs/
lib.rs

1#![allow(clippy::needless_range_loop)]
2
3pub mod analysis;
4pub mod charts;
5pub mod cli;
6pub mod config;
7pub mod contaminant;
8pub mod encoding;
9#[cfg(feature = "gui")]
10pub mod gui;
11pub mod modules;
12pub mod report;
13pub mod sequence;
14pub mod statistics;
15pub mod utils;
16
17use std::path::{Path, PathBuf};
18use std::sync::{Arc, RwLock};
19
20use config::FastQCConfig;
21use modules::adapter_content::AdapterContent;
22use modules::basic_stats::BasicStats;
23use modules::duplication_level::DuplicationLevel;
24use modules::kmer_content::KmerContent;
25use modules::module_config::ModuleConfig;
26use modules::n_content::NContent;
27use modules::overrepresented_seqs::{OverRepresentedSeqs, SharedDuplicationData};
28use modules::per_base_quality::PerBaseQuality;
29use modules::per_base_sequence_content::PerBaseSequenceContent;
30use modules::per_sequence_gc_content::PerSequenceGCContent;
31use modules::per_sequence_quality::PerSequenceQualityScores;
32use modules::per_tile_quality::PerTileQuality;
33use modules::sequence_length_distribution::SequenceLengthDistribution;
34use modules::QCModule;
35use sequence::bam_file;
36use sequence::fast5_file::Fast5FileReader;
37use sequence::fastq_file::FastQFile;
38
/// Crate version string, taken from `CARGO_PKG_VERSION` at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
40
41/// Create the standard module list, matching Java ModuleFactory.getStandardModuleList().
42pub fn create_modules(config: &FastQCConfig) -> Vec<Box<dyn QCModule>> {
43    let limits = config.limits_file.as_deref();
44    let mc = || ModuleConfig::new(limits);
45
46    let shared_data = Arc::new(RwLock::new(SharedDuplicationData::default()));
47
48    let overrep = OverRepresentedSeqs::new(mc(), config.clone(), Arc::clone(&shared_data));
49    let duplication = DuplicationLevel::new(mc(), Arc::clone(&shared_data));
50
51    vec![
52        Box::new(BasicStats::new(&mc())),
53        Box::new(PerBaseQuality::new(mc(), config.clone())),
54        Box::new(PerTileQuality::new(mc(), config.clone())),
55        Box::new(PerSequenceQualityScores::new(mc())),
56        Box::new(PerBaseSequenceContent::new(mc(), config.clone())),
57        Box::new(PerSequenceGCContent::new(mc())),
58        Box::new(NContent::new(mc(), config.clone())),
59        Box::new(SequenceLengthDistribution::new(mc(), config)),
60        Box::new(duplication),
61        Box::new(overrep),
62        Box::new(AdapterContent::new(mc(), config.clone())),
63        Box::new(KmerContent::new(mc(), config.clone())),
64    ]
65}
66
67/// Run FastQC analysis on a single file and return the data report text.
68/// Detect input format from config or file extension.
69/// Matches Java SequenceFactory logic.
70pub fn detect_format(path: &Path, config: &FastQCConfig) -> String {
71    if let Some(ref fmt) = config.sequence_format {
72        return match fmt.as_str() {
73            "bam" | "sam" | "bam_mapped" | "sam_mapped" => fmt.clone(),
74            _ => "fastq".to_string(),
75        };
76    }
77    let name = path
78        .file_name()
79        .map(|n| n.to_string_lossy().to_lowercase())
80        .unwrap_or_default();
81    if name.ends_with(".bam") || name.ends_with(".ubam") {
82        "bam".to_string()
83    } else if name.ends_with(".sam") {
84        "sam".to_string()
85    } else if name.ends_with(".fast5") {
86        "fast5".to_string()
87    } else {
88        "fastq".to_string()
89    }
90}
91
92/// Per-module QC result with structured data for programmatic access.
93#[derive(Debug, Clone)]
94pub struct ModuleResult {
95    /// Module display name (e.g. "Basic Statistics")
96    pub name: String,
97    /// Pass / Warn / Fail
98    pub status: modules::QCStatus,
99    /// Tab-separated text data for this module (same as in fastqc_data.txt)
100    pub data_text: String,
101    /// Chart data for rendering, if this module produces a chart
102    pub chart_data: Option<charts::ChartData>,
103}
104
105/// Result of running FastQC on a single file or sequence set.
106#[derive(Debug)]
107pub struct FastQCReport {
108    /// The text data report (fastqc_data.txt content)
109    pub data_report: String,
110    /// The HTML report (fastqc_report.html content)
111    pub html_report: String,
112    /// FastQC `summary.txt` content.
113    pub summary_report: String,
114    /// Rendered chart images for a FastQC-compatible archive.
115    pub chart_images: Vec<ChartImage>,
116    /// Per-module structured results for programmatic access
117    pub modules: Vec<ModuleResult>,
118}
119
/// One rendered chart, as stored in the FastQC archive.
#[derive(Debug, Clone)]
pub struct ChartImage {
    /// File name inside the archive; normally placed under `Images/` in a FastQC zip.
    pub filename: String,
    /// MIME type of the rendered image, e.g. `image/png` or `image/svg+xml`.
    pub mime_type: String,
    /// The encoded image itself.
    pub bytes: Vec<u8>,
}
130
/// One unit of work: the file(s) that are analysed together into one report.
#[derive(Debug, Clone)]
struct AnalysisInput {
    /// Files streamed into the same module set; empty for the stdin input.
    paths: Vec<PathBuf>,
    /// Name shown in the report and in progress messages.
    report_name: String,
    /// Base name used for the `<stem>_fastqc.*` output files.
    output_stem: String,
}
137
/// Derive the output stem from a file name by removing known suffixes.
///
/// The suffixes are tried once each, in table order, and matched
/// case-insensitively; the original casing of what remains is kept. Because it
/// is a single ordered pass, `reads.fastq.gz` becomes `reads` while
/// `reads.gz.fastq` becomes `reads.gz`.
fn strip_fastqc_suffixes(name: &str) -> String {
    const SUFFIXES: [&str; 9] = [
        ".gz", ".bz2", ".txt", ".fastq", ".fq", ".csfastq", ".sam", ".bam", ".ubam",
    ];

    SUFFIXES.iter().fold(name.to_string(), |mut stem, &suffix| {
        if stem.to_lowercase().ends_with(suffix) {
            let keep = stem.len() - suffix.len();
            stem.truncate(keep);
        }
        stem
    })
}
150
/// Return the final component of `path` as a (lossily converted) `String`,
/// or `"unknown"` when the path has no file name (e.g. `/` or `..`).
fn file_name_string(path: &Path) -> String {
    match path.file_name() {
        Some(os_name) => os_name.to_string_lossy().to_string(),
        None => "unknown".to_string(),
    }
}
156
/// Map a CASAVA chunk file name to the name of the group it belongs to.
///
/// A chunk is `<base>_NNN.fastq` or `<base>_NNN.fastq.gz`, where `NNN` is
/// exactly three ASCII digits; the result is `<base>.fastq` or
/// `<base>.fastq.gz`. Any other name yields `None`.
///
/// The check works on bytes and only slices the string at a position known to
/// hold an ASCII `_`, so names containing multi-byte UTF-8 characters are
/// rejected with `None` instead of panicking on a non-boundary slice.
fn casava_basename(name: &str) -> Option<String> {
    // '_' followed by three digits.
    const TAG_LEN: usize = 4;

    for extension in [".fastq.gz", ".fastq"] {
        let stem = match name.strip_suffix(extension) {
            Some(stem) => stem,
            None => continue,
        };

        // A name can end in only one of the two extensions, so the decision
        // is final once the extension has matched.
        let bytes = stem.as_bytes();
        let marker = bytes.len().checked_sub(TAG_LEN)?;
        let is_chunk_tag =
            bytes[marker] == b'_' && bytes[marker + 1..].iter().all(u8::is_ascii_digit);

        return if is_chunk_tag {
            // `marker` indexes an ASCII byte, so it is a valid char boundary.
            Some(format!("{}{}", &stem[..marker], extension))
        } else {
            None
        };
    }

    None
}
184
/// Derive the nanopore group name from a fast5 file name.
///
/// After an optional `.fast5` extension is removed, the name is split on `_`
/// and its first three fields are joined again with `_`. Names with fewer
/// than three fields yield `None`.
fn nanopore_basename(name: &str) -> Option<String> {
    let stem = match name.strip_suffix(".fast5") {
        Some(trimmed) => trimmed,
        None => name,
    };

    let mut fields = stem.split('_');
    let first = fields.next()?;
    let second = fields.next()?;
    let third = fields.next()?;
    Some(format!("{}_{}_{}", first, second, third))
}
194
195fn collect_analysis_inputs(
196    files: &[String],
197    config: &FastQCConfig,
198) -> Result<Vec<AnalysisInput>, Box<dyn std::error::Error>> {
199    if files.len() == 1 && files[0].starts_with("stdin") {
200        return Ok(vec![AnalysisInput {
201            paths: Vec::new(),
202            report_name: "stdin".to_string(),
203            output_stem: "stdin".to_string(),
204        }]);
205    }
206
207    let mut paths = Vec::new();
208    for file in files {
209        let path = PathBuf::from(file);
210        if !path.exists() {
211            eprintln!("Skipping {} - file not found", file);
212            continue;
213        }
214
215        if config.nano && path.is_dir() {
216            for entry in std::fs::read_dir(&path)? {
217                let entry = entry?;
218                let entry_path = entry.path();
219                if entry_path.is_file() && file_name_string(&entry_path).ends_with(".fast5") {
220                    paths.push(entry_path);
221                } else if entry_path.is_dir() {
222                    for sub_entry in std::fs::read_dir(&entry_path)? {
223                        let sub_path = sub_entry?.path();
224                        if sub_path.is_file() && file_name_string(&sub_path).ends_with(".fast5") {
225                            paths.push(sub_path);
226                        }
227                    }
228                }
229            }
230        } else {
231            paths.push(path);
232        }
233    }
234
235    if config.casava {
236        let mut groups = std::collections::BTreeMap::<String, Vec<PathBuf>>::new();
237        for path in paths {
238            let name = file_name_string(&path);
239            let key = if let Some(base) = casava_basename(&name) {
240                base
241            } else {
242                eprintln!("File '{}' didn't look like part of a CASAVA group", name);
243                name
244            };
245            groups.entry(key).or_default().push(path);
246        }
247
248        Ok(groups
249            .into_iter()
250            .map(|(report_name, paths)| AnalysisInput {
251                output_stem: strip_fastqc_suffixes(&report_name),
252                report_name,
253                paths,
254            })
255            .collect())
256    } else if config.nano {
257        let mut groups = std::collections::BTreeMap::<String, Vec<PathBuf>>::new();
258        for path in paths {
259            let name = file_name_string(&path);
260            if name.contains("muxscan") {
261                continue;
262            }
263            let key = nanopore_basename(&name).unwrap_or_else(|| {
264                eprintln!("File '{}' didn't look like part of a nanopore group", name);
265                name.clone()
266            });
267            groups.entry(key).or_default().push(path);
268        }
269
270        Ok(groups
271            .into_iter()
272            .map(|(report_name, paths)| AnalysisInput {
273                output_stem: strip_fastqc_suffixes(&report_name),
274                report_name,
275                paths,
276            })
277            .collect())
278    } else {
279        Ok(paths
280            .into_iter()
281            .map(|path| {
282                let report_name = file_name_string(&path);
283                AnalysisInput {
284                    output_stem: strip_fastqc_suffixes(&report_name),
285                    report_name,
286                    paths: vec![path],
287                }
288            })
289            .collect())
290    }
291}
292
293/// Run FastQC analysis on a single file and return both reports.
294pub fn run_fastqc_on_file(
295    path: &Path,
296    config: &FastQCConfig,
297) -> Result<FastQCReport, Box<dyn std::error::Error>> {
298    let file_name = file_name_string(path);
299    run_fastqc_on_paths(&[path.to_path_buf()], &file_name, config)
300}
301
302fn run_fastqc_on_paths(
303    paths: &[PathBuf],
304    report_name: &str,
305    config: &FastQCConfig,
306) -> Result<FastQCReport, Box<dyn std::error::Error>> {
307    if !config.quiet {
308        eprintln!("Started analysis of {}", report_name);
309    }
310
311    let mut modules = create_modules(config);
312    let mut count = 0u64;
313    let sequence_report_name = if paths.len() > 1 {
314        paths
315            .first()
316            .map(|path| file_name_string(path))
317            .unwrap_or_else(|| report_name.to_string())
318    } else {
319        report_name.to_string()
320    };
321
322    for path in paths {
323        let format = detect_format(path, config);
324        let only_mapped = format == "bam_mapped" || format == "sam_mapped";
325        let sequence_report_name = sequence_report_name.clone();
326
327        count += match format.as_str() {
328            "fast5" => {
329                let fast5 = Fast5FileReader::open(path)?;
330                let sequences = fast5.map(move |r| {
331                    r.map(|mut seq| {
332                        seq.file_name = sequence_report_name.clone();
333                        seq
334                    })
335                    .map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
336                });
337                analysis::run_analysis(sequences, &mut modules, config.quiet, config.min_length)?
338            }
339            "bam" | "bam_mapped" => {
340                let bam = bam_file::BamFileReader::open(path, only_mapped)?;
341                let sequences = bam.map(move |r| {
342                    r.map(|mut seq| {
343                        seq.file_name = sequence_report_name.clone();
344                        seq
345                    })
346                    .map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
347                });
348                analysis::run_analysis(sequences, &mut modules, config.quiet, config.min_length)?
349            }
350            "sam" | "sam_mapped" => {
351                let sam = bam_file::SamFileReader::open(path, only_mapped)?;
352                let sequences = sam.map(move |r| {
353                    r.map(|mut seq| {
354                        seq.file_name = sequence_report_name.clone();
355                        seq
356                    })
357                    .map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
358                });
359                analysis::run_analysis(sequences, &mut modules, config.quiet, config.min_length)?
360            }
361            _ => {
362                let fq = FastQFile::open(path, config.casava, config.nofilter)?;
363                let sequences = fq.map(move |r| {
364                    r.map(|mut seq| {
365                        seq.file_name = sequence_report_name.clone();
366                        seq
367                    })
368                    .map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
369                });
370                analysis::run_analysis(sequences, &mut modules, config.quiet, config.min_length)?
371            }
372        };
373    }
374
375    if !config.quiet {
376        eprintln!(
377            "Analysis complete for {} ({} sequences)",
378            report_name, count
379        );
380    }
381
382    let data_report = report::generate_data_report(&mut modules, VERSION);
383    let summary_report = report::generate_summary_report(&mut modules, report_name);
384    let html_report = report::generate_html_report(
385        &mut modules,
386        report_name,
387        VERSION,
388        config.svg_output,
389        config.embed_images,
390    );
391    let chart_images = collect_chart_images(&mut modules, config.svg_output);
392
393    // Extract structured per-module results
394    let module_results: Vec<ModuleResult> = modules
395        .iter_mut()
396        .filter(|m| !m.ignore_in_report())
397        .map(|m| {
398            let mut data_text = String::new();
399            m.make_data_report(&mut data_text);
400            ModuleResult {
401                name: m.name().to_string(),
402                status: m.status(),
403                data_text,
404                chart_data: m.chart_data(),
405            }
406        })
407        .collect();
408
409    Ok(FastQCReport {
410        data_report,
411        html_report,
412        summary_report,
413        chart_images,
414        modules: module_results,
415    })
416}
417
418fn collect_chart_images(modules: &mut [Box<dyn QCModule>], svg_output: bool) -> Vec<ChartImage> {
419    modules
420        .iter_mut()
421        .filter_map(|module| {
422            let chart_data = module.chart_data()?;
423            let mut filename = chart_data.image_filename(module.name());
424            let (mime_type, bytes) = if svg_output {
425                filename = filename
426                    .strip_suffix(".png")
427                    .map(|stem| format!("{}.svg", stem))
428                    .unwrap_or_else(|| format!("{}.svg", filename));
429                (
430                    "image/svg+xml".to_string(),
431                    charts::render_chart_to_svg(&chart_data).ok()?,
432                )
433            } else {
434                (
435                    "image/png".to_string(),
436                    charts::render_chart_to_png(&chart_data).ok()?,
437                )
438            };
439
440            Some(ChartImage {
441                filename,
442                mime_type,
443                bytes,
444            })
445        })
446        .collect()
447}
448
449/// Process a single FastQC input group: analyze, write reports.
450fn process_analysis_input(
451    input: &AnalysisInput,
452    config: &FastQCConfig,
453) -> Result<(), Box<dyn std::error::Error>> {
454    let report = run_fastqc_on_paths(&input.paths, &input.report_name, config)?;
455    let stem = &input.output_stem;
456
457    let output_dir = config.output_dir.as_deref().unwrap_or_else(|| {
458        input
459            .paths
460            .first()
461            .and_then(|path| path.parent())
462            .unwrap_or(Path::new("."))
463    });
464
465    // Write top-level HTML report
466    let html_path = output_dir.join(format!("{}_fastqc.html", stem));
467    std::fs::write(&html_path, &report.html_report)?;
468
469    // Create ZIP archive containing the FastQC report folder
470    let zip_path = output_dir.join(format!("{}_fastqc.zip", stem));
471    report::write_fastqc_archive(
472        &zip_path,
473        stem,
474        &report.data_report,
475        &report.html_report,
476        &report.summary_report,
477        &report.chart_images,
478    )?;
479
480    // Handle --extract: unzip the archive
481    if config.do_unzip {
482        let extract_dir = output_dir.join(format!("{}_fastqc", stem));
483        std::fs::create_dir_all(&extract_dir)?;
484        std::fs::create_dir_all(extract_dir.join("Icons"))?;
485        std::fs::create_dir_all(extract_dir.join("Images"))?;
486        std::fs::write(extract_dir.join("fastqc_data.txt"), &report.data_report)?;
487        std::fs::write(extract_dir.join("fastqc_report.html"), &report.html_report)?;
488        std::fs::write(extract_dir.join("summary.txt"), &report.summary_report)?;
489        for (name, bytes) in report::fastqc_icon_files() {
490            std::fs::write(extract_dir.join("Icons").join(name), bytes)?;
491        }
492        for image in &report.chart_images {
493            std::fs::write(
494                extract_dir.join("Images").join(&image.filename),
495                &image.bytes,
496            )?;
497        }
498
499        // Handle --delete: remove ZIP after extraction
500        if config.delete_after_unzip {
501            std::fs::remove_file(&zip_path)?;
502        }
503    }
504
505    if !config.quiet {
506        eprintln!("Report written to {}", html_path.display());
507    }
508
509    Ok(())
510}
511
512/// Run FastQC analysis on the given files with the provided configuration.
513/// Uses rayon thread pool when --threads > 1 for parallel file processing.
514pub fn run_fastqc(
515    files: &[String],
516    config: &FastQCConfig,
517) -> Result<(), Box<dyn std::error::Error>> {
518    if !config.quiet {
519        eprintln!("fastqc-compliant-rs v{}", VERSION);
520    }
521
522    // Handle stdin specially when it is the only input, matching FastQC.
523    if files.len() == 1 && files[0].starts_with("stdin") {
524        // Process stdin sequentially (can't parallelize stdin)
525        let fq = FastQFile::from_stdin(config.casava, config.nofilter)?;
526        let mut modules = create_modules(config);
527        let sequences = fq.map(|r| {
528            r.map(|mut seq| {
529                seq.file_name = "stdin".to_string();
530                seq
531            })
532            .map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
533        });
534        analysis::run_analysis(sequences, &mut modules, config.quiet, config.min_length)?;
535        let data_report = report::generate_data_report(&mut modules, VERSION);
536        let summary_report = report::generate_summary_report(&mut modules, "stdin");
537        let html_report = report::generate_html_report(
538            &mut modules,
539            "stdin",
540            VERSION,
541            config.svg_output,
542            config.embed_images,
543        );
544        let chart_images = collect_chart_images(&mut modules, config.svg_output);
545
546        let output_dir = config.output_dir.as_deref().unwrap_or(Path::new("."));
547        let html_path = output_dir.join("stdin_fastqc.html");
548        std::fs::write(&html_path, &html_report)?;
549        let zip_path = output_dir.join("stdin_fastqc.zip");
550        report::write_fastqc_archive(
551            &zip_path,
552            "stdin",
553            &data_report,
554            &html_report,
555            &summary_report,
556            &chart_images,
557        )?;
558        if config.do_unzip {
559            let extract_dir = output_dir.join("stdin_fastqc");
560            std::fs::create_dir_all(&extract_dir)?;
561            std::fs::create_dir_all(extract_dir.join("Icons"))?;
562            std::fs::create_dir_all(extract_dir.join("Images"))?;
563            std::fs::write(extract_dir.join("fastqc_data.txt"), &data_report)?;
564            std::fs::write(extract_dir.join("fastqc_report.html"), &html_report)?;
565            std::fs::write(extract_dir.join("summary.txt"), &summary_report)?;
566            for (name, bytes) in report::fastqc_icon_files() {
567                std::fs::write(extract_dir.join("Icons").join(name), bytes)?;
568            }
569            for image in &chart_images {
570                std::fs::write(
571                    extract_dir.join("Images").join(&image.filename),
572                    &image.bytes,
573                )?;
574            }
575            if config.delete_after_unzip {
576                std::fs::remove_file(&zip_path)?;
577            }
578        }
579        if !config.quiet {
580            eprintln!("Report written to {}", html_path.display());
581        }
582        return Ok(());
583    }
584
585    let inputs = collect_analysis_inputs(files, config)?;
586
587    if config.threads > 1 {
588        // Parallel processing with rayon
589        let pool = rayon::ThreadPoolBuilder::new()
590            .num_threads(config.threads)
591            .build()
592            .map_err(|e| format!("Failed to create thread pool: {}", e))?;
593
594        let errors: Vec<String> = pool.install(|| {
595            use rayon::prelude::*;
596            inputs
597                .par_iter()
598                .filter_map(|input| {
599                    if let Err(e) = process_analysis_input(input, config) {
600                        Some(format!("{}: {}", input.report_name, e))
601                    } else {
602                        None
603                    }
604                })
605                .collect()
606        });
607
608        if !errors.is_empty() {
609            for err in &errors {
610                eprintln!("Error: {}", err);
611            }
612            return Err(format!("{} file(s) failed to process", errors.len()).into());
613        }
614    } else {
615        // Sequential processing
616        for input in &inputs {
617            process_analysis_input(input, config)?;
618        }
619    }
620
621    Ok(())
622}
623
624/// Public API for running QC on in-memory sequences.
625pub struct FastQCRunner {
626    config: FastQCConfig,
627}
628
629impl FastQCRunner {
630    pub fn new(config: FastQCConfig) -> Self {
631        Self { config }
632    }
633
634    /// Run QC on an iterator of sequences and return structured results.
635    pub fn run_sequences(
636        &self,
637        sequences: impl Iterator<Item = sequence::Sequence>,
638    ) -> Result<FastQCReport, Box<dyn std::error::Error>> {
639        let mut modules = create_modules(&self.config);
640        let sequences = sequences.map(Ok::<_, Box<dyn std::error::Error>>);
641        analysis::run_analysis(
642            sequences,
643            &mut modules,
644            self.config.quiet,
645            self.config.min_length,
646        )?;
647        let data_report = report::generate_data_report(&mut modules, VERSION);
648        let summary_report = report::generate_summary_report(&mut modules, "sequences");
649        let html_report = report::generate_html_report(
650            &mut modules,
651            "sequences",
652            VERSION,
653            self.config.svg_output,
654            self.config.embed_images,
655        );
656        let chart_images = collect_chart_images(&mut modules, self.config.svg_output);
657
658        let module_results: Vec<ModuleResult> = modules
659            .iter_mut()
660            .filter(|m| !m.ignore_in_report())
661            .map(|m| {
662                let mut data_text = String::new();
663                m.make_data_report(&mut data_text);
664                ModuleResult {
665                    name: m.name().to_string(),
666                    status: m.status(),
667                    data_text,
668                    chart_data: m.chart_data(),
669                }
670            })
671            .collect();
672
673        Ok(FastQCReport {
674            data_report,
675            html_report,
676            summary_report,
677            chart_images,
678            modules: module_results,
679        })
680    }
681
682    /// Run QC on a file path and return both reports.
683    pub fn run_file(&self, path: &Path) -> Result<FastQCReport, Box<dyn std::error::Error>> {
684        run_fastqc_on_file(path, &self.config)
685    }
686}
687
#[cfg(test)]
mod tests {
    use super::*;

    /// Scratch directory that is removed again when the guard goes out of
    /// scope, so test runs do not pile up folders in the system temp dir.
    struct TempDir(PathBuf);

    impl TempDir {
        /// Location of the scratch directory.
        fn path(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for TempDir {
        fn drop(&mut self) {
            // Best-effort cleanup: a failure to delete must not mask the
            // outcome of the test itself.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    /// Create a fresh scratch directory under the system temp dir.
    ///
    /// The name combines the process id, the current time and a per-process
    /// counter. The counter guarantees that tests running in parallel never
    /// share a directory, which matters now that directories are deleted.
    fn tempdir() -> TempDir {
        use std::sync::atomic::{AtomicUsize, Ordering};
        static NEXT_ID: AtomicUsize = AtomicUsize::new(0);

        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
        let dir = std::env::temp_dir().join(format!(
            "fastqc_rs_lib_{}_{}_{}",
            std::process::id(),
            nanos,
            id
        ));
        std::fs::create_dir_all(&dir).unwrap();
        TempDir(dir)
    }

    #[test]
    fn test_strip_fastqc_suffixes() {
        for (input, expected) in [
            ("reads.fastq", "reads"),
            ("reads.fq", "reads"),
            ("reads.fastq.gz", "reads"),
            ("reads.fq.gz", "reads"),
            ("reads.fastq.bz2", "reads"),
            ("reads.fq.bz2", "reads"),
            ("reads.txt", "reads"),
            ("reads.csfastq", "reads"),
            ("reads.sam", "reads"),
            ("reads.bam", "reads"),
            ("reads.ubam", "reads"),
        ] {
            assert_eq!(strip_fastqc_suffixes(input), expected);
        }
    }

    #[test]
    fn test_casava_basename_parsing() {
        assert_eq!(
            casava_basename("sample_001.fastq.gz"),
            Some("sample.fastq.gz".to_string())
        );
        assert_eq!(
            casava_basename("sample_123.fastq"),
            Some("sample.fastq".to_string())
        );
        assert_eq!(casava_basename("sample.fastq.gz"), None);
        assert_eq!(casava_basename("sample_ABC.fastq"), None);
    }

    #[test]
    fn test_nanopore_basename_parsing() {
        assert_eq!(
            nanopore_basename("Computer_Sample_42_ch100_file7_strand.fast5"),
            Some("Computer_Sample_42".to_string())
        );
        assert_eq!(
            nanopore_basename("Computer_Sample_42.fast5"),
            Some("Computer_Sample_42".to_string())
        );
        assert_eq!(nanopore_basename("short.fast5"), None);
    }

    #[test]
    fn test_nanopore_directory_scan_depth_and_muxscan_filter() {
        let dir = tempdir();
        let root = dir.path();
        let nested = root.join("nested");
        let deep = nested.join("deep");
        std::fs::create_dir(&nested).unwrap();
        std::fs::create_dir(&deep).unwrap();

        // One file per directory level, plus a muxscan file that must be dropped.
        std::fs::write(root.join("Run_Sample_001_ch1_file1_strand.fast5"), b"").unwrap();
        std::fs::write(nested.join("Run_Sample_001_ch1_file2_strand.fast5"), b"").unwrap();
        std::fs::write(nested.join("Run_Sample_001_muxscan.fast5"), b"").unwrap();
        std::fs::write(deep.join("Run_Sample_001_ch1_file3_strand.fast5"), b"").unwrap();

        let config = FastQCConfig {
            nano: true,
            ..Default::default()
        };
        let inputs =
            collect_analysis_inputs(&[root.to_string_lossy().to_string()], &config).unwrap();

        assert_eq!(inputs.len(), 1);
        assert_eq!(inputs[0].report_name, "Run_Sample_001");
        // Only the top level and its direct sub-directory are scanned.
        assert_eq!(inputs[0].paths.len(), 2);
        assert!(inputs[0]
            .paths
            .iter()
            .all(|path| !file_name_string(path).contains("muxscan")));
        assert!(inputs[0]
            .paths
            .iter()
            .all(|path| path.parent() != Some(deep.as_path())));
    }

    #[test]
    fn test_casava_invalid_names_become_singletons() {
        let dir = tempdir();
        let grouped_a = dir.path().join("sample_001.fastq");
        let grouped_b = dir.path().join("sample_002.fastq");
        let singleton = dir.path().join("other.fastq");
        for path in [&grouped_a, &grouped_b, &singleton] {
            std::fs::write(path, b"").unwrap();
        }

        let config = FastQCConfig {
            casava: true,
            ..Default::default()
        };
        let inputs = collect_analysis_inputs(
            &[
                grouped_a.to_string_lossy().to_string(),
                grouped_b.to_string_lossy().to_string(),
                singleton.to_string_lossy().to_string(),
            ],
            &config,
        )
        .unwrap();

        let grouped = inputs
            .iter()
            .find(|input| input.report_name == "sample.fastq")
            .unwrap();
        let single = inputs
            .iter()
            .find(|input| input.report_name == "other.fastq")
            .unwrap();

        assert_eq!(grouped.paths.len(), 2);
        assert_eq!(single.paths.len(), 1);
    }
}