use std::collections::HashMap;
use std::path::PathBuf;
/// Error returned when a file name does not follow the naming scheme a
/// basename helper expects.
///
/// The type carries no data: it only signals that the name could not be
/// reduced to a group base name. It is `Copy` and comparable so that callers
/// (and tests) can compare whole `Result` values directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NameFormatError;
impl std::fmt::Display for NameFormatError {
    /// Writes the fixed, human-readable description of the error.
    ///
    /// The text is emitted verbatim; width/alignment flags on the formatter
    /// are not applied.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        formatter.write_str("Filename does not match CASAVA naming convention")
    }
}
// Marks `NameFormatError` as a standard error type so it can be used wherever
// a `std::error::Error` is expected. The body is empty: the trait's default
// methods are used as-is, and the unit struct wraps no underlying error.
impl std::error::Error for NameFormatError {}
/// Derives the group base name for a chunked FASTQ file name.
///
/// A matching name has the shape `<prefix>_<NNN>.fastq.gz` or
/// `<prefix>_<NNN>.fastq`, where `NNN` is exactly three ASCII digits. The
/// `_NNN` chunk marker is removed and the extension is kept, so
/// `SampleA_S1_L001_R1_001.fastq.gz` becomes `SampleA_S1_L001_R1.fastq.gz`.
///
/// The check works on bytes rather than on `str` slices at fixed offsets, so
/// names containing multi-byte UTF-8 characters are handled without panicking.
/// Only plain digits are accepted as the chunk number; a sign such as `+12`
/// is rejected.
///
/// # Errors
///
/// Returns [`NameFormatError`] when the name has neither supported extension
/// or when the text before the extension does not end in `_` followed by
/// three ASCII digits.
pub fn get_casava_basename(original_name: &str) -> Result<String, NameFormatError> {
    // Byte length of the `_NNN` chunk marker that precedes the extension.
    const CHUNK_MARKER_LEN: usize = 4;

    let (stem, extension) = if let Some(stem) = original_name.strip_suffix(".fastq.gz") {
        (stem, ".fastq.gz")
    } else if let Some(stem) = original_name.strip_suffix(".fastq") {
        (stem, ".fastq")
    } else {
        return Err(NameFormatError);
    };

    let bytes = stem.as_bytes();
    if bytes.len() < CHUNK_MARKER_LEN {
        return Err(NameFormatError);
    }

    let prefix_len = bytes.len() - CHUNK_MARKER_LEN;
    let marker = &bytes[prefix_len..];
    let well_formed = marker[0] == b'_' && marker[1..].iter().all(u8::is_ascii_digit);
    if !well_formed {
        return Err(NameFormatError);
    }

    // `prefix_len` is the position of an ASCII `_`, hence a valid char
    // boundary, so this slice cannot panic.
    Ok(format!("{}{}", &stem[..prefix_len], extension))
}
/// Buckets `files` by the base name that `get_casava_basename` derives from
/// each path's final component.
///
/// A file whose name is not accepted by `get_casava_basename` is reported on
/// stderr and placed in a group keyed by its own file name. Paths without a
/// final component are keyed by the empty string.
///
/// The returned groups are ordered by the first appearance of their key in
/// `files`, and the paths inside each group keep their input order.
pub fn get_casava_groups(files: &[PathBuf]) -> Vec<(String, Vec<PathBuf>)> {
    // Maps a group key to the position of that group in `grouped`.
    let mut slot_of: HashMap<String, usize> = HashMap::new();
    let mut grouped: Vec<(String, Vec<PathBuf>)> = Vec::new();

    for path in files {
        let leaf = match path.file_name() {
            Some(os_name) => os_name.to_string_lossy().into_owned(),
            None => String::new(),
        };

        let key = match get_casava_basename(&leaf) {
            Ok(base) => base,
            Err(_) => {
                eprintln!("File '{}' didn't look like part of a CASAVA group", leaf);
                leaf
            }
        };

        let slot = match slot_of.get(&key) {
            Some(&existing) => existing,
            None => {
                // First time this key is seen: open a new group at the end.
                let fresh = grouped.len();
                grouped.push((key.clone(), Vec::new()));
                slot_of.insert(key, fresh);
                fresh
            }
        };
        grouped[slot].1.push(path.clone());
    }

    grouped
}
/// Builds a base name from the first three `_`-separated fields of a file
/// name.
///
/// A trailing `.fast5` extension, if present, is removed before splitting;
/// names without it are split as they are. The resulting base name is also
/// printed to stderr.
///
/// # Errors
///
/// Returns [`NameFormatError`] when the name has fewer than three
/// `_`-separated fields.
pub fn get_nanopore_basename(original_name: &str) -> Result<String, NameFormatError> {
    let stem = match original_name.strip_suffix(".fast5") {
        Some(without_extension) => without_extension,
        None => original_name,
    };

    let mut fields = stem.split('_');
    let leading = match (fields.next(), fields.next(), fields.next()) {
        (Some(first), Some(second), Some(third)) => [first, second, third],
        _ => return Err(NameFormatError),
    };

    let basename = leading.join("_");
    eprintln!("Basename is {}", basename);
    Ok(basename)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The `_001` chunk marker is dropped from a gzipped FASTQ name.
    #[test]
    fn test_casava_basename_fastq_gz() {
        let base = get_casava_basename("SampleA_S1_L001_R1_001.fastq.gz").unwrap();
        assert_eq!(base, "SampleA_S1_L001_R1.fastq.gz");
    }

    /// The same reduction applies to an uncompressed FASTQ name.
    #[test]
    fn test_casava_basename_fastq() {
        let base = get_casava_basename("SampleA_S1_L001_R1_001.fastq").unwrap();
        assert_eq!(base, "SampleA_S1_L001_R1.fastq");
    }

    /// Any three-digit chunk number is stripped, not only `001`.
    #[test]
    fn test_casava_basename_different_chunk() {
        let base = get_casava_basename("SampleA_S1_L001_R1_042.fastq.gz").unwrap();
        assert_eq!(base, "SampleA_S1_L001_R1.fastq.gz");
    }

    /// Names without a chunk marker or with another extension are rejected.
    #[test]
    fn test_casava_basename_non_casava() {
        for name in &["sample.fastq.gz", "sample.bam"] {
            assert!(get_casava_basename(name).is_err());
        }
    }

    /// A marker whose three characters are not a number is rejected.
    #[test]
    fn test_casava_basename_not_digits() {
        let outcome = get_casava_basename("sample_abc.fastq.gz");
        assert!(outcome.is_err());
    }

    /// Chunks of one sample share a group; unmatched files get their own.
    #[test]
    fn test_casava_groups() {
        let files: Vec<PathBuf> = [
            "SampleA_S1_L001_R1_001.fastq.gz",
            "SampleA_S1_L001_R1_002.fastq.gz",
            "SampleB_S2_L001_R1_001.fastq.gz",
            "non_casava.fastq.gz",
        ]
        .iter()
        .map(|name| PathBuf::from(*name))
        .collect();

        let groups = get_casava_groups(&files);
        assert_eq!(groups.len(), 3);

        // Number of files stored under `key`; panics if the group is absent.
        let members_of = |key: &str| -> usize {
            groups
                .iter()
                .find(|(name, _)| name == key)
                .map(|(_, members)| members.len())
                .unwrap()
        };
        assert_eq!(members_of("SampleA_S1_L001_R1.fastq.gz"), 2);
        assert_eq!(members_of("SampleB_S2_L001_R1.fastq.gz"), 1);
    }

    /// Only the first three `_`-separated fields are kept.
    #[test]
    fn test_nanopore_basename() {
        let base = get_nanopore_basename("Computer_Sample_123_ch100_file0_strand.fast5").unwrap();
        assert_eq!(base, "Computer_Sample_123");
    }

    /// A name with exactly three fields maps to itself minus the extension.
    #[test]
    fn test_nanopore_basename_short() {
        let base = get_nanopore_basename("Computer_Sample_123.fast5").unwrap();
        assert_eq!(base, "Computer_Sample_123");
    }

    /// Fewer than three fields is an error.
    #[test]
    fn test_nanopore_basename_too_few() {
        let outcome = get_nanopore_basename("Computer_Sample.fast5");
        assert!(outcome.is_err());
    }
}