bustools_algorithms 0.2.0

Rust reimplementation of bustools for scRNAseq processing
Documentation
//! A sparse countmatrix: cells vs genes
//!
//! Usually the result of a bustools count call or [crate::count::count()]
//!
//! ## Example
//! ```rust, no_run
//! // load from disk
//! # use bustools_algorithms::countmatrix::CountMatrix;
//! # use std::path::Path;
//! let path = Path::new("/path/to/bus/count/folder");
//! let cmat = CountMatrix::from_disk(
//!     &path.join("gene.mtx"),
//!     &path.join("gene.barcodes.txt"),
//!     &path.join("gene.genes.txt"),
//!  );
//!  // shorter, assuming standard file names
//!  let cmat = CountMatrix::from_folder(path);
//!
//! // write to disk again
//! // note that the folder must exist already
//! let outpath = Path::new("/tmp/bustools_test_read_write");
//! if !outpath.exists() {
//!     std::fs::create_dir(outpath).unwrap();
//! }
//! cmat.write(outpath);
//! ```
use std::{
    collections::HashMap,
    fmt,
    fs::File,
    io::{BufRead, BufReader, Write}, path::Path,
};

use sprs::{
    io::{read_matrix_market, write_matrix_market}, TriMat
};

/// Countmatrix, cells-by-genes
///
/// Cells and genes are indexed via their string reprensentation
///
#[derive(Debug)]
pub struct CountMatrix {
    /// sparse count matrix
    pub matrix: sprs::CsMat<i32>,
    cbs: Vec<String>,
    genes: Vec<String>,
}
impl CountMatrix {
    /// create a CountMatrix from a sparse matrix type ([sprs::CsMat]) and name the rows (cells) and columns (genes)
    pub fn new(matrix: sprs::CsMat<i32>, cbs: Vec<String>, genes: Vec<String>) -> CountMatrix {
        CountMatrix { matrix, cbs, genes }
    }

    /// turns the count-matrix into a HashMap for easier comparision to other countmatrices
    pub fn to_map(&self) -> HashMap<(String, String), i32> {
        // transforms the sparse count matrix into a Hashmap (CB,Gene)-> count
        let mut h1: HashMap<(String, String), i32> = HashMap::new();

        for (value, (i, j)) in self.matrix.iter() {
            h1.insert((self.cbs[i].clone(), self.genes[j].clone()), *value);
        }
        h1
    }

    /// get the matrix's shape (nrows, ncols)
    pub fn get_shape(&self) -> (usize, usize) {
        self.matrix.shape()
    }

    /// load a countmatrix from disk (kallisto format: mtx + barcodes.txt + genes)
    /// 
    /// Oddly kallisto stores counts are `real` in the mmFormat (bustools v0.43.2)
    /// Hence we need to read a f32-sparse matrix and convert to ints
    pub fn from_disk(mtx_file: &Path, cbfile: &Path, genefile: &Path) -> Self {
        // load countmatrix from disk, from matrix-market format
        let mat: TriMat<f32> =
            read_matrix_market(mtx_file).unwrap_or_else(|e| panic!("cant load {:?}: {:?}", mtx_file, e));

        println!("Convertting f32 -> i32");
        // need to convert to i32
        let intdata: Vec<i32> = mat.data().iter().map(|x| x.round() as i32).collect();

        let intmat: TriMat<i32> = TriMat::from_triplets(
            mat.shape(), 
            mat.row_inds().to_vec(), 
            mat.col_inds().to_vec(), 
            intdata
        );
        println!("Done Convertting f32 -> i32");

        let matrix: sprs::CsMat<i32> = intmat.to_csr();


        let fh = File::open(cbfile).unwrap_or_else(|_| panic!("{:?} not found", cbfile));
        // Read the file line by line, and return an iterator of the lines of the file.
        let cbs: Vec<String> = BufReader::new(fh)
            .lines()
            .collect::<Result<_, _>>()
            .unwrap();

        let fh = File::open(genefile).unwrap_or_else(|_| panic!("{:?} not found", genefile));
        let genes: Vec<String> = BufReader::new(fh)
            .lines()
            .collect::<Result<_, _>>()
            .unwrap();

        CountMatrix { matrix, cbs, genes }
    }

    /// load the countmatrix from a folder, assuming standatd file naming
    pub fn from_folder(foldername: &Path) -> Self {
        let mfile =    foldername.join("gene.mtx");
        let cbfile =   foldername.join("gene.barcodes.txt");
        let genefile = foldername.join("gene.genes.txt");
        CountMatrix::from_disk(&mfile, &cbfile, &genefile)
    }

    /// write the matrix to disk in
    /// [MatrixMarket format](https://math.nist.gov/MatrixMarket/formats.html) + cell and gene metadata (just like kallisto)
    ///
    /// creates 3 files:
    /// * `gene.mtx`: the sparse matrix
    /// * `gene.barcodes.txt`: String representation fo the cell barcodes
    /// * `gene.genes.txt`: Gene names
    pub fn write(&self, foldername: &Path) {
        let mfile = foldername.join("gene.mtx");
        let cbfile = foldername.join("gene.barcodes.txt");
        let genefile = foldername.join("gene.genes.txt");


        // silly: kallisto stores `real`s in the mmFormat
        // hence we need to convert i32->f32 and write those to disk
        println!("Convertting i32 -> f32");
        let mut floatdata: Vec<f32> = Vec::new();
        let mut rows: Vec<usize> = Vec::new();
        let mut cols: Vec<usize> = Vec::new();
        for (&v, (r,c)) in self.matrix.iter() {
            floatdata.push(v as f32);
            rows.push(r);
            cols.push(c);
        }

        let fmat: TriMat<f32> = TriMat::from_triplets(
            self.matrix.shape(), 
            rows,
            cols,
            floatdata
        );
        println!("Done Convertting f32 -> i32");

        write_matrix_market(&mfile, &fmat).unwrap_or_else(|_x| panic!("{:?} not found", mfile));

        let mut fh_cb = File::create(&cbfile).unwrap_or_else(|_x| panic!("{:?} not found", cbfile));
        let mut fh_gene = File::create(&genefile).unwrap_or_else(|_x| panic!("{:?} not found", genefile));

        for cb in self.cbs.iter() {
            fh_cb.write_all(format!("{}\n", cb).as_bytes()).unwrap();
        }

        for g in self.genes.iter() {
            fh_gene.write_all(format!("{}\n", g).as_bytes()).unwrap();
        }
    }
}

impl PartialEq for CountMatrix {
    /// comparing countmatrices. True if they represnet the same cb/gene counts irrespective of ordering
    fn eq(&self, other: &Self) -> bool {
        // the tricky to be invariant to ordering: transform to hashmap
        let h1 = self.to_map();
        let h2 = other.to_map();
        h1 == h2
    }
}
impl Eq for CountMatrix {}

impl fmt::Display for CountMatrix {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "Shape: {:?};  nnz {}",
            self.get_shape(),
            self.matrix.nnz()
        )
    }
}

#[cfg(test)]
mod test {
    use super::CountMatrix;
    use crate::count2::countmap_to_matrix;
    use bustools_core::{consistent_genes::{CB, GeneId, Genename}, hashmap};
    use ndarray::arr2;
    use std::collections::HashMap;
    use tempfile::tempdir;

    #[test]
    fn test_countmatrix() {
        let countmap: HashMap<(CB, GeneId), usize> = hashmap!(
            (CB(0), GeneId(0)) => 10, 
            (CB(0), GeneId(1)) => 1, 
            (CB(1), GeneId(0)) => 0,  // lets see what happens with empty counts
            (CB(1), GeneId(1)) => 5      
        );


        let gene_vector = vec![Genename("geneA".to_string()), Genename("geneB".to_string())];

        let cmat = countmap_to_matrix(&countmap, gene_vector);

        let dense_mat = cmat.matrix.to_dense();
        let expected = arr2(&[[10_i32, 1], [0, 5]]);
        assert_eq!(dense_mat, expected);

        assert_eq!(
            cmat.cbs,
            vec![
                "AAAAAAAAAAAAAAAA".to_string(),
                "AAAAAAAAAAAAAAAC".to_string()
            ]
        );
    }

    #[test]
    fn test_read_write() {
        let countmap: HashMap<(CB, GeneId), usize> = hashmap!(
            (CB(0), GeneId(0)) => 10,
            (CB(0), GeneId(1)) => 1,
            (CB(1), GeneId(0)) => 0, // lets see what happens with empty counts
            (CB(1), GeneId(1)) => 5           
        );

        let gene_vector = vec![Genename("geneA".to_string()), Genename("geneB".to_string())];
        let cmat = countmap_to_matrix(&countmap, gene_vector);

        let dir = tempdir().unwrap();
        let path = dir.path().join("bustools_test_read_write");
        if !path.exists() {
            std::fs::create_dir(&path).unwrap();
        }

        cmat.write(&path);

        let cmat2 = CountMatrix::from_disk(
            &path.join("gene.mtx"),
            &path.join("gene.barcodes.txt"),
            &path.join("gene.genes.txt"),
        );

        assert!(cmat == cmat2);
    }

    #[test]
    fn test_countmatrix_equal() {
        //testing the Eq implementation, which should be order invariant (doesnt matter how genes are ordered)

        // two cells two genes
        // first cell has both genes
        // second cell has only the second gene
        let countmap1: HashMap<(CB, GeneId), usize> = hashmap!(
            (CB(0), GeneId(0)) => 10,
            (CB(0), GeneId(1)) =>  1,
            (CB(1), GeneId(0)) =>  0, // lets see what happens with empty counts
            (CB(1), GeneId(1)) =>  5
        );

        let gene_vector = vec![Genename("geneA".to_string()), Genename("geneB".to_string())];

        let cmat1 = countmap_to_matrix(&countmap1, gene_vector);

        // a version with permuated genes
        let countmap2: HashMap<(CB, GeneId), usize> = hashmap!(
            (CB(0), GeneId(1)) => 10,
            (CB(0), GeneId(0)) => 1,
            (CB(1), GeneId(1)) => 0, // lets see what happens with empty counts
            (CB(1), GeneId(0)) => 5  
        );

        let gene_vector = vec![Genename("geneB".to_string()), Genename("geneA".to_string())];
        let cmat2 = countmap_to_matrix(&countmap2, gene_vector);

        println!("{:?}", cmat1.to_map());
        println!("{:?}", cmat2.to_map());

        assert!(cmat1 == cmat2);
    }
}