redicat 0.4.2

REDICAT - RNA Editing Cellular Assessment Toolkit: A highly parallelized utility for analyzing RNA editing events in single-cell RNA-seq data
Documentation
//! Cell barcode processing functionality

use anyhow::Result;
use flate2::read::GzDecoder;
use rustc_hash::FxHashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::sync::Arc;

/// Processor for cell barcodes
///
/// Keeps the barcode whitelist in two forms: an ordered list and a
/// barcode-to-id lookup map. Both are held behind `Arc`, so the derived
/// `Clone` shares the underlying data instead of copying every barcode.
#[derive(Debug, Clone)]
pub struct BarcodeProcessor {
    // Barcodes in whitelist order; `barcode_by_id` indexes into this list.
    ordered_barcodes: Arc<Vec<String>>,
    // Barcode string -> numeric id, used by `is_valid` and `id_of`.
    barcode_to_id: Arc<FxHashMap<String, u32>>,
}

impl BarcodeProcessor {
    /// Create a new BarcodeProcessor from a file containing barcodes
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let file = File::open(path)?;
        let reader: Box<dyn BufRead> = if path
            .extension()
            .and_then(|s| s.to_str())
            .map(|s| s.eq_ignore_ascii_case("gz"))
            .unwrap_or(false)
        {
            Box::new(BufReader::with_capacity(256 * 1024, GzDecoder::new(file)))
        } else {
            Box::new(BufReader::with_capacity(256 * 1024, file))
        };

        Self::from_reader(reader)
    }

    /// Report whether `barcode` is present in the whitelist.
    #[inline]
    pub fn is_valid(&self, barcode: &str) -> bool {
        self.id_of(barcode).is_some()
    }

    /// Return the numeric identifier assigned to `barcode`, or `None` when
    /// the barcode is not in the whitelist.
    #[inline]
    pub fn id_of(&self, barcode: &str) -> Option<u32> {
        let id = self.barcode_to_id.get(barcode)?;
        Some(*id)
    }

    /// Return the barcode stored at position `id` in the ordered whitelist,
    /// or `None` when `id` is past the end of the list.
    #[inline]
    pub fn barcode_by_id(&self, id: u32) -> Option<&str> {
        let position = id as usize;
        self.ordered_barcodes.get(position).map(String::as_str)
    }

    /// Number of entries in the ordered whitelist.
    pub fn len(&self) -> usize {
        self.ordered_barcodes().len()
    }

    /// Report whether the whitelist holds no barcodes at all.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Return an owned copy of every barcode, in whitelist order.
    ///
    /// This clones each string; use `ordered_barcodes` to borrow instead.
    pub fn barcodes(&self) -> Vec<String> {
        self.ordered_barcodes.to_vec()
    }

    /// Borrow the whitelist, in order, as a slice (no allocation).
    pub fn ordered_barcodes(&self) -> &[String] {
        self.ordered_barcodes.as_slice()
    }

    fn from_reader(reader: Box<dyn BufRead>) -> Result<Self> {
        let mut ordered = Vec::with_capacity(1024);
        let mut index = FxHashMap::default();
        let mut line = String::with_capacity(64);

        for result in reader.lines() {
            line.clear();
            match result {
                Ok(line_content) => {
                    let barcode = line_content.trim();
                    if !barcode.is_empty() {
                        let clean_barcode = barcode.split('-').next().unwrap_or(barcode);
                        if !index.contains_key(clean_barcode) {
                            let id = ordered.len() as u32;
                            ordered.push(clean_barcode.to_string());
                            index.insert(clean_barcode.to_string(), id);
                        }
                    }
                }
                Err(e) => return Err(e.into()),
            }
        }

        ordered.shrink_to_fit();
        index.shrink_to_fit();
        Ok(BarcodeProcessor {
            ordered_barcodes: Arc::new(ordered),
            barcode_to_id: Arc::new(index),
        })
    }

    /// Construct a processor from an explicit list of barcodes, preserving order.
    pub fn from_vec(barcodes: Vec<String>) -> Self {
        let mut index = FxHashMap::with_capacity_and_hasher(barcodes.len(), Default::default());
        for (i, barcode) in barcodes.iter().enumerate() {
            index.insert(barcode.clone(), i as u32);
        }

        BarcodeProcessor {
            ordered_barcodes: Arc::new(barcodes),
            barcode_to_id: Arc::new(index),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::BarcodeProcessor;
    use flate2::{write::GzEncoder, Compression};
    use std::fs::File;
    use std::io::Write;
    use tempfile::tempdir;

    /// Ids follow list position and lookups work in both directions.
    #[test]
    fn from_vec_preserves_order_and_lookup() {
        let whitelist: Vec<String> = ["AAACCTG", "TTTGCAA", "GGGATTA"]
            .iter()
            .copied()
            .map(String::from)
            .collect();
        let processor = BarcodeProcessor::from_vec(whitelist);

        assert!(!processor.is_empty());
        assert_eq!(processor.len(), 3);

        // barcode -> id
        assert_eq!(processor.id_of("AAACCTG"), Some(0));
        assert_eq!(processor.id_of("TTTGCAA"), Some(1));
        assert_eq!(processor.id_of("MISSING"), None);

        // id -> barcode
        assert_eq!(processor.barcode_by_id(2), Some("GGGATTA"));
        assert_eq!(processor.barcode_by_id(3), None);
    }

    /// Suffixes are stripped, whitespace is trimmed, blank lines are skipped
    /// and repeated barcodes keep their first id.
    #[test]
    fn from_file_normalizes_and_deduplicates() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("barcodes.tsv");

        // Same barcode under two suffixes, a blank line, a bare barcode, a padded one.
        let lines = ["AAACCTG-1", "AAACCTG-2", "", "TTTGCAA", "  GGGATTA-9  "];
        {
            let mut file = File::create(&path).unwrap();
            for line in lines.iter() {
                writeln!(file, "{}", line).unwrap();
            }
        } // the handle is closed here, before the file is read back

        let processor = BarcodeProcessor::from_file(&path).unwrap();

        assert_eq!(processor.ordered_barcodes(), &["AAACCTG", "TTTGCAA", "GGGATTA"]);
        assert_eq!(processor.id_of("AAACCTG"), Some(0));
        assert_eq!(processor.id_of("TTTGCAA"), Some(1));
        assert_eq!(processor.id_of("GGGATTA"), Some(2));
        assert!(processor.is_valid("AAACCTG"));
        assert!(!processor.is_valid("AAACCTG-1"));
    }

    /// A `.gz` path is decompressed transparently.
    #[test]
    fn from_gz_file_is_supported() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("barcodes.tsv.gz");

        let mut encoder = GzEncoder::new(File::create(&path).unwrap(), Compression::default());
        encoder.write_all(b"CELLA-1\nCELLB-1\n").unwrap();
        encoder.finish().unwrap();

        let processor = BarcodeProcessor::from_file(&path).unwrap();
        assert_eq!(processor.len(), 2);
        assert_eq!(processor.id_of("CELLA"), Some(0));
        assert_eq!(processor.id_of("CELLB"), Some(1));
    }
}