use anyhow::Result;
use flate2::read::GzDecoder;
use rustc_hash::FxHashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::sync::Arc;
/// List of barcodes with lookups in both directions between a barcode string
/// and its numeric `u32` id.
///
/// Both containers sit behind an `Arc`, so cloning a `BarcodeProcessor` shares
/// the underlying data instead of copying it.
#[derive(Debug, Clone)]
pub struct BarcodeProcessor {
/// Barcodes in id order: the position of an entry in this vector is its id.
ordered_barcodes: Arc<Vec<String>>,
/// Reverse lookup from a barcode string to its id.
barcode_to_id: Arc<FxHashMap<String, u32>>,
}
impl BarcodeProcessor {
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file = File::open(path)?;
let reader: Box<dyn BufRead> = if path
.extension()
.and_then(|s| s.to_str())
.map(|s| s.eq_ignore_ascii_case("gz"))
.unwrap_or(false)
{
Box::new(BufReader::with_capacity(256 * 1024, GzDecoder::new(file)))
} else {
Box::new(BufReader::with_capacity(256 * 1024, file))
};
Self::from_reader(reader)
}
#[inline]
pub fn is_valid(&self, barcode: &str) -> bool {
self.barcode_to_id.contains_key(barcode)
}
#[inline]
pub fn id_of(&self, barcode: &str) -> Option<u32> {
self.barcode_to_id.get(barcode).copied()
}
#[inline]
pub fn barcode_by_id(&self, id: u32) -> Option<&str> {
self.ordered_barcodes.get(id as usize).map(|s| s.as_str())
}
pub fn len(&self) -> usize {
self.ordered_barcodes.len()
}
pub fn is_empty(&self) -> bool {
self.ordered_barcodes.is_empty()
}
pub fn barcodes(&self) -> Vec<String> {
self.ordered_barcodes.as_ref().clone()
}
pub fn ordered_barcodes(&self) -> &[String] {
self.ordered_barcodes.as_ref()
}
fn from_reader(reader: Box<dyn BufRead>) -> Result<Self> {
let mut ordered = Vec::with_capacity(1024);
let mut index = FxHashMap::default();
let mut line = String::with_capacity(64);
for result in reader.lines() {
line.clear();
match result {
Ok(line_content) => {
let barcode = line_content.trim();
if !barcode.is_empty() {
let clean_barcode = barcode.split('-').next().unwrap_or(barcode);
if !index.contains_key(clean_barcode) {
let id = ordered.len() as u32;
ordered.push(clean_barcode.to_string());
index.insert(clean_barcode.to_string(), id);
}
}
}
Err(e) => return Err(e.into()),
}
}
ordered.shrink_to_fit();
index.shrink_to_fit();
Ok(BarcodeProcessor {
ordered_barcodes: Arc::new(ordered),
barcode_to_id: Arc::new(index),
})
}
pub fn from_vec(barcodes: Vec<String>) -> Self {
let mut index = FxHashMap::with_capacity_and_hasher(barcodes.len(), Default::default());
for (i, barcode) in barcodes.iter().enumerate() {
index.insert(barcode.clone(), i as u32);
}
BarcodeProcessor {
ordered_barcodes: Arc::new(barcodes),
barcode_to_id: Arc::new(index),
}
}
}
#[cfg(test)]
mod tests {
    use super::BarcodeProcessor;
    use flate2::{write::GzEncoder, Compression};
    use std::fs::File;
    use std::io::Write;
    use tempfile::tempdir;

    /// `from_vec` assigns ids by position and answers lookups in both directions.
    #[test]
    fn from_vec_preserves_order_and_lookup() {
        let input: Vec<String> = ["AAACCTG", "TTTGCAA", "GGGATTA"]
            .iter()
            .map(|s| String::from(*s))
            .collect();
        let processor = BarcodeProcessor::from_vec(input);

        // Size queries.
        assert_eq!(processor.len(), 3);
        assert!(!processor.is_empty());

        // Barcode -> id.
        assert_eq!(processor.id_of("AAACCTG"), Some(0));
        assert_eq!(processor.id_of("TTTGCAA"), Some(1));
        assert_eq!(processor.id_of("MISSING"), None);

        // Id -> barcode, including one past the end.
        assert_eq!(processor.barcode_by_id(2), Some("GGGATTA"));
        assert_eq!(processor.barcode_by_id(3), None);
    }

    /// Plain-text input: suffixes are stripped, blank lines skipped, whitespace
    /// trimmed and repeated barcodes collapsed onto their first occurrence.
    #[test]
    fn from_file_normalizes_and_deduplicates() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("barcodes.tsv");
        // Same bytes as writing the five lines one by one.
        std::fs::write(&path, "AAACCTG-1\nAAACCTG-2\n\nTTTGCAA\n GGGATTA-9 \n").unwrap();

        let processor = BarcodeProcessor::from_file(&path).unwrap();

        assert_eq!(processor.ordered_barcodes(), &["AAACCTG", "TTTGCAA", "GGGATTA"]);
        assert_eq!(processor.id_of("AAACCTG"), Some(0));
        assert_eq!(processor.id_of("TTTGCAA"), Some(1));
        assert_eq!(processor.id_of("GGGATTA"), Some(2));
        assert!(processor.is_valid("AAACCTG"));
        // Queries are matched verbatim, so the suffixed form is not known.
        assert!(!processor.is_valid("AAACCTG-1"));
    }

    /// A `.gz` path is transparently decompressed.
    #[test]
    fn from_gz_file_is_supported() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("barcodes.tsv.gz");
        let mut encoder = GzEncoder::new(File::create(&path).unwrap(), Compression::default());
        encoder.write_all(b"CELLA-1\nCELLB-1\n").unwrap();
        encoder.finish().unwrap();

        let processor = BarcodeProcessor::from_file(&path).unwrap();

        assert_eq!(processor.len(), 2);
        assert_eq!(processor.id_of("CELLA"), Some(0));
        assert_eq!(processor.id_of("CELLB"), Some(1));
    }
}