fastqc_rust/sequence/casava.rs
// CASAVA and Nanopore basename extraction and file grouping
// Corresponds to Utilities/CasavaBasename.java (and the Java `NanoporeBasename` class)
3
4use std::collections::HashMap;
5use std::path::PathBuf;
6
/// Error returned when a filename does not match the expected naming convention.
///
/// Produced by [`get_casava_basename`] for names that lack the CASAVA
/// `_NNN.fastq[.gz]` tail and by [`get_nanopore_basename`] for names with fewer
/// than three underscore-separated fields.
///
/// Mirrors `Utilities.NameFormatException` in Java.
///
/// The type carries no data, so it is cheap to copy and compare; the extra
/// derives let callers and tests write `assert_eq!(result, Err(NameFormatError))`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NameFormatError;
12
13impl std::fmt::Display for NameFormatError {
14 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15 write!(f, "Filename does not match CASAVA naming convention")
16 }
17}
18
/// Marks [`NameFormatError`] as a standard error type.
///
/// The body is empty, so every `std::error::Error` method keeps its default
/// implementation.
impl std::error::Error for NameFormatError {}
20
21/// Extract the CASAVA basename from a filename.
22///
23/// CASAVA filenames follow the pattern:
24/// `{SampleName}_{Barcode}_{Lane}_{Read}_{FlowCellChunkNumber}.fastq.gz`
25///
26/// This strips the `_{3digits}` before the `.fastq[.gz]` extension, yielding the
27/// sample group name. Files sharing the same basename are treated as one logical sample.
28///
29/// This is a direct translation of `CasavaBasename.getCasavaBasename()`.
30/// The Java code uses character-position arithmetic rather than regex to parse
31/// the trailing `_NNN.fastq[.gz]` pattern. We replicate the exact same index
32/// math to ensure identical grouping behavior.
33pub fn get_casava_basename(original_name: &str) -> Result<String, NameFormatError> {
34 // The Java code checks two cases: `.fastq.gz` and `.fastq`.
35 // For `.fastq.gz`: expects `_` at position len-13, then 3 digits at len-12..len-9.
36 // For `.fastq`: expects `_` at position len-10, then 3 digits at len-9..len-6.
37
38 if original_name.ends_with(".fastq.gz") {
39 let len = original_name.len();
40 // Check for '_' 13 chars before the end (i.e. before "NNN.fastq.gz")
41 if len >= 13
42 && &original_name[len - 13..len - 12] == "_"
43 && original_name[len - 12..len - 9].parse::<u32>().is_ok()
44 {
45 // basename = everything before the underscore + ".fastq.gz"
46 let base_name = format!("{}.fastq.gz", &original_name[..len - 13]);
47 return Ok(base_name);
48 }
49 } else if original_name.ends_with(".fastq") {
50 let len = original_name.len();
51 // Check for '_' 10 chars before the end (i.e. before "NNN.fastq")
52 if len >= 10
53 && &original_name[len - 10..len - 9] == "_"
54 && original_name[len - 9..len - 6].parse::<u32>().is_ok()
55 {
56 // basename = everything before the underscore + ".fastq"
57 let base_name = format!("{}.fastq", &original_name[..len - 10]);
58 return Ok(base_name);
59 }
60 }
61
62 Err(NameFormatError)
63}
64
65/// Group files by their CASAVA basename.
66///
67/// Returns a `Vec` of `(basename, files)` tuples. Files whose names do not
68/// conform to the CASAVA pattern are placed in their own singleton group.
69///
70/// Direct translation of `CasavaBasename.getCasavaGroups()`.
71/// When a file does not match, Java prints a warning to stderr and adds
72/// it as a singleton group keyed by the raw filename.
73pub fn get_casava_groups(files: &[PathBuf]) -> Vec<(String, Vec<PathBuf>)> {
74 // Java uses a Hashtable (unordered) for grouping. We use
75 // an IndexMap-style approach with a Vec to preserve insertion order, but
76 // the Java code does not guarantee any ordering either. Using a HashMap
77 // here matches the Java Hashtable semantics.
78 let mut groups: HashMap<String, Vec<PathBuf>> = HashMap::new();
79 let mut order: Vec<String> = Vec::new();
80
81 for file in files {
82 let file_name = file
83 .file_name()
84 .map(|n| n.to_string_lossy().into_owned())
85 .unwrap_or_default();
86
87 match get_casava_basename(&file_name) {
88 Ok(base_name) => {
89 if !groups.contains_key(&base_name) {
90 order.push(base_name.clone());
91 }
92 groups.entry(base_name).or_default().push(file.clone());
93 }
94 Err(_) => {
95 // Java prints warning and adds as singleton
96 eprintln!(
97 "File '{}' didn't look like part of a CASAVA group",
98 file_name
99 );
100 order.push(file_name.clone());
101 groups.entry(file_name).or_default().push(file.clone());
102 }
103 }
104 }
105
106 order
107 .into_iter()
108 .filter_map(|key| groups.remove(&key).map(|files| (key, files)))
109 .collect()
110}
111
112// ---------------------------------------------------------------------------
113// Nanopore basename extraction
114// ---------------------------------------------------------------------------
115
116/// Extract the Nanopore basename from a filename.
117///
118/// Nanopore filenames follow the pattern:
119/// `Computer_Samplename_number[_chXXX_fileXXX_strand].fast5`
120///
121/// This extracts the first three underscore-separated components as the group name.
122///
123/// Direct translation of `NanoporeBasename.getNanoporeBasename()`.
124/// The Java code splits on `_` after stripping `.fast5`, requires at least 3
125/// parts, and joins the first three with underscores.
126pub fn get_nanopore_basename(original_name: &str) -> Result<String, NameFormatError> {
127 // Java does `originalName.replaceAll(".fast5$", "").split("_")`
128 let stripped = original_name
129 .strip_suffix(".fast5")
130 .unwrap_or(original_name);
131 let sub_names: Vec<&str> = stripped.split('_').collect();
132
133 if sub_names.len() < 3 {
134 return Err(NameFormatError);
135 }
136
137 // Java joins first 3 components: `subNames[0]+"_"+subNames[1]+"_"+subNames[2]`
138 let basename = format!("{}_{}_{}", sub_names[0], sub_names[1], sub_names[2]);
139
140 // Java prints basename to stderr for debugging
141 eprintln!("Basename is {}", basename);
142
143 Ok(basename)
144}
145
146// ---------------------------------------------------------------------------
147// Tests
148// ---------------------------------------------------------------------------
149
#[cfg(test)]
mod tests {
    //! Unit tests for the CASAVA and Nanopore basename helpers.
    //!
    //! Errors are checked with `is_err()` rather than by comparing values.

    use super::*;

    // ---- CASAVA basename tests ----

    #[test]
    fn test_casava_basename_fastq_gz() {
        // Standard CASAVA filename with .fastq.gz
        assert_eq!(
            get_casava_basename("SampleA_S1_L001_R1_001.fastq.gz").unwrap(),
            "SampleA_S1_L001_R1.fastq.gz"
        );
    }

    #[test]
    fn test_casava_basename_fastq() {
        // CASAVA filename without compression
        assert_eq!(
            get_casava_basename("SampleA_S1_L001_R1_001.fastq").unwrap(),
            "SampleA_S1_L001_R1.fastq"
        );
    }

    #[test]
    fn test_casava_basename_different_chunk() {
        // Any three-digit chunk number is stripped, not just 001
        assert_eq!(
            get_casava_basename("SampleA_S1_L001_R1_042.fastq.gz").unwrap(),
            "SampleA_S1_L001_R1.fastq.gz"
        );
    }

    #[test]
    fn test_casava_basename_non_casava() {
        // Non-CASAVA filenames should fail
        assert!(get_casava_basename("sample.fastq.gz").is_err());
        assert!(get_casava_basename("sample.bam").is_err());
    }

    #[test]
    fn test_casava_basename_not_digits() {
        // The 3 chars before .fastq must be parseable as integer
        assert!(get_casava_basename("sample_abc.fastq.gz").is_err());
    }

    #[test]
    fn test_casava_basename_minimal_and_too_short() {
        // Shortest accepted names: nothing in front of the `_NNN` chunk
        assert_eq!(get_casava_basename("_001.fastq.gz").unwrap(), ".fastq.gz");
        assert_eq!(get_casava_basename("_001.fastq").unwrap(), ".fastq");

        // Names too short to hold `_NNN` must be rejected, not panic
        assert!(get_casava_basename("001.fastq.gz").is_err());
        assert!(get_casava_basename(".fastq.gz").is_err());
        assert!(get_casava_basename(".fastq").is_err());
        assert!(get_casava_basename("").is_err());
    }

    // ---- CASAVA grouping tests ----

    #[test]
    fn test_casava_groups() {
        let files = vec![
            PathBuf::from("SampleA_S1_L001_R1_001.fastq.gz"),
            PathBuf::from("SampleA_S1_L001_R1_002.fastq.gz"),
            PathBuf::from("SampleB_S2_L001_R1_001.fastq.gz"),
            PathBuf::from("non_casava.fastq.gz"),
        ];
        let groups = get_casava_groups(&files);

        // Three groups, in order of first appearance: SampleA, SampleB, and
        // the non-casava singleton keyed by its raw file name
        let names: Vec<&str> = groups.iter().map(|(name, _)| name.as_str()).collect();
        assert_eq!(
            names,
            [
                "SampleA_S1_L001_R1.fastq.gz",
                "SampleB_S2_L001_R1.fastq.gz",
                "non_casava.fastq.gz",
            ]
        );

        // Members keep their input order inside each group
        assert_eq!(groups[0].1, vec![files[0].clone(), files[1].clone()]);
        assert_eq!(groups[1].1, vec![files[2].clone()]);
        assert_eq!(groups[2].1, vec![files[3].clone()]);
    }

    #[test]
    fn test_casava_groups_ignore_directories() {
        // Only the final path component decides the group; full paths are kept
        let files = vec![
            PathBuf::from("run1/S_R1_001.fastq.gz"),
            PathBuf::from("run2/S_R1_002.fastq.gz"),
        ];
        let groups = get_casava_groups(&files);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].0, "S_R1.fastq.gz");
        assert_eq!(groups[0].1, files);
    }

    #[test]
    fn test_casava_groups_empty() {
        assert!(get_casava_groups(&[]).is_empty());
    }

    // ---- Nanopore basename tests ----

    #[test]
    fn test_nanopore_basename() {
        assert_eq!(
            get_nanopore_basename("Computer_Sample_123_ch100_file0_strand.fast5").unwrap(),
            "Computer_Sample_123"
        );
    }

    #[test]
    fn test_nanopore_basename_short() {
        // Newer format with just 3 components
        assert_eq!(
            get_nanopore_basename("Computer_Sample_123.fast5").unwrap(),
            "Computer_Sample_123"
        );
    }

    #[test]
    fn test_nanopore_basename_without_extension() {
        // A missing `.fast5` suffix is tolerated
        assert_eq!(
            get_nanopore_basename("Computer_Sample_123").unwrap(),
            "Computer_Sample_123"
        );
    }

    #[test]
    fn test_nanopore_basename_too_few() {
        assert!(get_nanopore_basename("Computer_Sample.fast5").is_err());
    }
}
246}