Skip to main content

fastqc_rust/sequence/
casava.rs

1// CASAVA basename extraction and file grouping
2// Corresponds to Utilities/CasavaBasename.java
3
4use std::collections::HashMap;
5use std::path::PathBuf;
6
/// Error returned when a filename does not match the expected naming convention.
///
/// Mirrors `Utilities.NameFormatException` in Java.
///
/// The type is a payload-free marker: it only signals that the name could not
/// be parsed. It derives `Clone`, `PartialEq` and `Eq` so that `Result`s
/// carrying it can be compared directly (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NameFormatError;
12
13impl std::fmt::Display for NameFormatError {
14    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15        write!(f, "Filename does not match CASAVA naming convention")
16    }
17}
18
19impl std::error::Error for NameFormatError {}
20
21/// Extract the CASAVA basename from a filename.
22///
23/// CASAVA filenames follow the pattern:
24///   `{SampleName}_{Barcode}_{Lane}_{Read}_{FlowCellChunkNumber}.fastq.gz`
25///
26/// This strips the `_{3digits}` before the `.fastq[.gz]` extension, yielding the
27/// sample group name. Files sharing the same basename are treated as one logical sample.
28///
29/// This is a direct translation of `CasavaBasename.getCasavaBasename()`.
30/// The Java code uses character-position arithmetic rather than regex to parse
31/// the trailing `_NNN.fastq[.gz]` pattern. We replicate the exact same index
32/// math to ensure identical grouping behavior.
33pub fn get_casava_basename(original_name: &str) -> Result<String, NameFormatError> {
34    // The Java code checks two cases: `.fastq.gz` and `.fastq`.
35    // For `.fastq.gz`: expects `_` at position len-13, then 3 digits at len-12..len-9.
36    // For `.fastq`:    expects `_` at position len-10, then 3 digits at len-9..len-6.
37
38    if original_name.ends_with(".fastq.gz") {
39        let len = original_name.len();
40        // Check for '_' 13 chars before the end (i.e. before "NNN.fastq.gz")
41        if len >= 13
42            && &original_name[len - 13..len - 12] == "_"
43            && original_name[len - 12..len - 9].parse::<u32>().is_ok()
44        {
45            // basename = everything before the underscore + ".fastq.gz"
46            let base_name = format!("{}.fastq.gz", &original_name[..len - 13]);
47            return Ok(base_name);
48        }
49    } else if original_name.ends_with(".fastq") {
50        let len = original_name.len();
51        // Check for '_' 10 chars before the end (i.e. before "NNN.fastq")
52        if len >= 10
53            && &original_name[len - 10..len - 9] == "_"
54            && original_name[len - 9..len - 6].parse::<u32>().is_ok()
55        {
56            // basename = everything before the underscore + ".fastq"
57            let base_name = format!("{}.fastq", &original_name[..len - 10]);
58            return Ok(base_name);
59        }
60    }
61
62    Err(NameFormatError)
63}
64
65/// Group files by their CASAVA basename.
66///
67/// Returns a `Vec` of `(basename, files)` tuples. Files whose names do not
68/// conform to the CASAVA pattern are placed in their own singleton group.
69///
70/// Direct translation of `CasavaBasename.getCasavaGroups()`.
71/// When a file does not match, Java prints a warning to stderr and adds
72/// it as a singleton group keyed by the raw filename.
73pub fn get_casava_groups(files: &[PathBuf]) -> Vec<(String, Vec<PathBuf>)> {
74    // Java uses a Hashtable (unordered) for grouping. We use
75    // an IndexMap-style approach with a Vec to preserve insertion order, but
76    // the Java code does not guarantee any ordering either. Using a HashMap
77    // here matches the Java Hashtable semantics.
78    let mut groups: HashMap<String, Vec<PathBuf>> = HashMap::new();
79    let mut order: Vec<String> = Vec::new();
80
81    for file in files {
82        let file_name = file
83            .file_name()
84            .map(|n| n.to_string_lossy().into_owned())
85            .unwrap_or_default();
86
87        match get_casava_basename(&file_name) {
88            Ok(base_name) => {
89                if !groups.contains_key(&base_name) {
90                    order.push(base_name.clone());
91                }
92                groups.entry(base_name).or_default().push(file.clone());
93            }
94            Err(_) => {
95                // Java prints warning and adds as singleton
96                eprintln!(
97                    "File '{}' didn't look like part of a CASAVA group",
98                    file_name
99                );
100                order.push(file_name.clone());
101                groups.entry(file_name).or_default().push(file.clone());
102            }
103        }
104    }
105
106    order
107        .into_iter()
108        .filter_map(|key| groups.remove(&key).map(|files| (key, files)))
109        .collect()
110}
111
112// ---------------------------------------------------------------------------
113// Nanopore basename extraction
114// ---------------------------------------------------------------------------
115
116/// Extract the Nanopore basename from a filename.
117///
118/// Nanopore filenames follow the pattern:
119///   `Computer_Samplename_number[_chXXX_fileXXX_strand].fast5`
120///
121/// This extracts the first three underscore-separated components as the group name.
122///
123/// Direct translation of `NanoporeBasename.getNanoporeBasename()`.
124/// The Java code splits on `_` after stripping `.fast5`, requires at least 3
125/// parts, and joins the first three with underscores.
126pub fn get_nanopore_basename(original_name: &str) -> Result<String, NameFormatError> {
127    // Java does `originalName.replaceAll(".fast5$", "").split("_")`
128    let stripped = original_name
129        .strip_suffix(".fast5")
130        .unwrap_or(original_name);
131    let sub_names: Vec<&str> = stripped.split('_').collect();
132
133    if sub_names.len() < 3 {
134        return Err(NameFormatError);
135    }
136
137    // Java joins first 3 components: `subNames[0]+"_"+subNames[1]+"_"+subNames[2]`
138    let basename = format!("{}_{}_{}", sub_names[0], sub_names[1], sub_names[2]);
139
140    // Java prints basename to stderr for debugging
141    eprintln!("Basename is {}", basename);
142
143    Ok(basename)
144}
145
146// ---------------------------------------------------------------------------
147// Tests
148// ---------------------------------------------------------------------------
149
#[cfg(test)]
mod tests {
    use super::*;

    // ---- CASAVA basename tests ----

    /// A gzip-compressed CASAVA name loses its `_001` chunk suffix.
    #[test]
    fn test_casava_basename_fastq_gz() {
        let basename = get_casava_basename("SampleA_S1_L001_R1_001.fastq.gz").unwrap();
        assert_eq!(basename, "SampleA_S1_L001_R1.fastq.gz");
    }

    /// The same stripping applies to uncompressed `.fastq` files.
    #[test]
    fn test_casava_basename_fastq() {
        let basename = get_casava_basename("SampleA_S1_L001_R1_001.fastq").unwrap();
        assert_eq!(basename, "SampleA_S1_L001_R1.fastq");
    }

    /// Any three-digit chunk number is removed, not just `001`.
    #[test]
    fn test_casava_basename_different_chunk() {
        let basename = get_casava_basename("SampleA_S1_L001_R1_042.fastq.gz").unwrap();
        assert_eq!(basename, "SampleA_S1_L001_R1.fastq.gz");
    }

    /// Names without a chunk suffix, or with a foreign extension, are rejected.
    #[test]
    fn test_casava_basename_non_casava() {
        for name in ["sample.fastq.gz", "sample.bam"].iter() {
            assert!(get_casava_basename(name).is_err());
        }
    }

    /// The three characters before the extension must parse as an integer.
    #[test]
    fn test_casava_basename_not_digits() {
        let outcome = get_casava_basename("sample_abc.fastq.gz");
        assert!(outcome.is_err());
    }

    /// Two SampleA chunks share a group; SampleB and the non-conforming file
    /// each form a group of their own.
    #[test]
    fn test_casava_groups() {
        let files: Vec<PathBuf> = [
            "SampleA_S1_L001_R1_001.fastq.gz",
            "SampleA_S1_L001_R1_002.fastq.gz",
            "SampleB_S2_L001_R1_001.fastq.gz",
            "non_casava.fastq.gz",
        ]
        .iter()
        .map(|name| PathBuf::from(*name))
        .collect();

        let groups = get_casava_groups(&files);

        // SampleA, SampleB, and the non-casava singleton.
        assert_eq!(groups.len(), 3);

        // Number of files in the group with the given label, if it exists.
        let members_of = |label: &str| {
            groups
                .iter()
                .find(|(name, _)| name == label)
                .map(|(_, members)| members.len())
        };

        assert_eq!(members_of("SampleA_S1_L001_R1.fastq.gz"), Some(2));
        assert_eq!(members_of("SampleB_S2_L001_R1.fastq.gz"), Some(1));
    }

    // ---- Nanopore basename tests ----

    /// Fields after the third one are discarded.
    #[test]
    fn test_nanopore_basename() {
        let basename =
            get_nanopore_basename("Computer_Sample_123_ch100_file0_strand.fast5").unwrap();
        assert_eq!(basename, "Computer_Sample_123");
    }

    /// Newer format with exactly three components.
    #[test]
    fn test_nanopore_basename_short() {
        let basename = get_nanopore_basename("Computer_Sample_123.fast5").unwrap();
        assert_eq!(basename, "Computer_Sample_123");
    }

    /// Fewer than three components is an error.
    #[test]
    fn test_nanopore_basename_too_few() {
        let outcome = get_nanopore_basename("Computer_Sample.fast5");
        assert!(outcome.is_err());
    }
}