use std::collections::{HashMap, HashSet};
use core::fmt;
use rayon::prelude::*;
use mapping_info::MappingInfo;
use crate::cell_data::CellData;
use crate::cell_data::GeneUmiHash;
use crate::{FeatureIndex, MatrixValueType};
/// Container for per-cell data, spread over 256 hash-map buckets.
///
/// A cell id is routed to a bucket by its most significant byte (see
/// `to_key`). The export-related fields (`feature_ids_with_data`,
/// `total_feature_data_entries`, `export_cell_ids`, `checked`) are only
/// populated by `finalize_for_export` and are reset by
/// `invalidate_export_cache` whenever data is inserted or merged.
pub struct Scdata {
/// One map per possible value of a cell id's top byte; maps cell id to its `CellData`.
pub(crate) data: [HashMap<u64, CellData>; u8::MAX as usize + 1],
/// Feature ids, in the order given by the feature index, that occur in at
/// least one cell's `total_reads`; filled by `rebuild_feature_ids_with_data`.
pub(crate) feature_ids_with_data: Vec<u64>,
/// Sum of `total_reads.len()` over all stored cells, computed together with
/// `feature_ids_with_data`.
pub(crate) total_feature_data_entries: usize,
/// Ascending list of the cell ids that survived `restrict_to_cells`.
pub(crate) export_cell_ids: Vec<u64>,
/// `true` only after `finalize_for_export` ran and nothing was changed since.
checked: bool,
/// Used to size the work chunks in `passing_cell_set_by_umi`; a value of 0
/// is treated as 1 there.
pub num_threads: usize,
/// How matrix values are represented — NOTE(review): the exact semantics
/// live in `MatrixValueType`, which is defined elsewhere.
pub(crate) value_type: MatrixValueType,
}
impl Default for Scdata {
fn default() -> Self {
Self::new(1, MatrixValueType::Integer)
}
}
impl fmt::Display for Scdata {
    /// Writes a multi-line, human-readable summary: the export state, the
    /// value type, the number of features with data and the cell/bucket counts.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let feature_count = self.feature_ids_with_data.len();

        // Walk the buckets once, gathering both statistics at the same time.
        let (total_cells, non_empty_buckets) =
            self.data
                .iter()
                .fold((0usize, 0usize), |(cells, buckets), bucket| {
                    (
                        cells + bucket.len(),
                        buckets + usize::from(!bucket.is_empty()),
                    )
                });

        writeln!(f, "Scdata Summary")?;
        writeln!(f, "===============")?;
        writeln!(f, "Checked: {}", self.checked)?;
        writeln!(f, "Export Cells: {}", self.export_cell_ids.len())?;
        writeln!(f, "Matrix Value Type: {:?}", self.value_type)?;
        writeln!(f, "Features with Data: [{} entries]", feature_count)?;
        writeln!(f, "Total Cells: {}", total_cells)?;
        writeln!(f, "Non-empty Buckets: {}", non_empty_buckets)
    }
}
impl Scdata {
/// Creates an empty container.
///
/// All 256 buckets start out empty, the export cache is empty and the
/// container is marked as not yet checked for export.
pub fn new(num_threads: usize, value_type: MatrixValueType) -> Self {
    Self {
        num_threads,
        value_type,
        checked: false,
        total_feature_data_entries: 0,
        feature_ids_with_data: Vec::new(),
        export_cell_ids: Vec::new(),
        // One independent map per top-byte value of a cell id.
        data: std::array::from_fn(|_| HashMap::new()),
    }
}
/// Combined getter/setter for the stored `MatrixValueType`.
///
/// With `Some(v)` the stored value type is replaced by `v` first; with
/// `None` nothing changes. In both cases a reference to the (possibly
/// updated) stored value type is returned.
pub fn value_type(&mut self, value_type: Option<MatrixValueType>) -> &MatrixValueType {
    let Some(requested) = value_type else {
        return &self.value_type;
    };
    self.value_type = requested;
    &self.value_type
}
/// Maps a cell id to the index of the bucket that stores it.
///
/// The index is the id's most significant byte, so it is always in
/// `0..=255` and therefore valid for the 256-element `data` array.
#[inline]
fn to_key(&self, name: &u64) -> usize {
    // Big-endian byte 0 is the top 8 bits, i.e. the same as `*name >> 56`.
    usize::from(name.to_be_bytes()[0])
}
/// Drops everything derived by `finalize_for_export`.
///
/// Called before any mutation of `data` so that stale export information
/// can never be used: the id lists are emptied, the entry counter is zeroed
/// and `checked` is set back to `false`.
fn invalidate_export_cache(&mut self) {
    self.feature_ids_with_data.clear();
    self.export_cell_ids.clear();
    self.total_feature_data_entries = 0;
    self.checked = false;
}
pub(crate) fn values(&self) -> impl Iterator<Item = &CellData> {
self.data.iter().flat_map(|map| map.values())
}
/// Returns the ids of all stored cells as an owned vector.
///
/// Ids are listed bucket by bucket; within a bucket the order is the hash
/// map's iteration order, so callers needing a stable order must sort.
pub fn keys(&self) -> Vec<u64> {
    let mut ids = Vec::with_capacity(self.len());
    for bucket in &self.data {
        ids.extend(bucket.keys().copied());
    }
    ids
}
/// Returns `true` when no bucket holds any cell.
pub fn is_empty(&self) -> bool {
    let has_any_cell = self.data.iter().any(|bucket| !bucket.is_empty());
    !has_any_cell
}
/// Returns the total number of stored cells across all buckets.
pub fn len(&self) -> usize {
    self.data
        .iter()
        .fold(0, |total, bucket| total + bucket.len())
}
/// Looks up the data stored for the cell id `key`.
///
/// Only the bucket selected by `to_key` is searched; returns `None` when
/// the id is not present there.
pub fn get(&self, key: &u64) -> Option<&CellData> {
    self.data[self.to_key(key)].get(key)
}
/// Merges the content of `other` into `self`, processing buckets in parallel.
///
/// Buckets are paired by position: bucket `i` of `other` is merged into
/// bucket `i` of `self`. A cell already present in `self` is combined via
/// `CellData::merge`; a cell that is new to `self` is cloned in. `other` is
/// left untouched. Does nothing (and keeps the export cache) when `other`
/// is empty; otherwise the export cache is invalidated first.
pub fn merge(&mut self, other: &Scdata) {
    use std::collections::hash_map::Entry;

    if other.is_empty() {
        return;
    }
    self.invalidate_export_cache();

    // Both arrays have the same fixed length, so zipping pairs every bucket
    // of `self` with the bucket of `other` at the same index.
    self.data
        .par_iter_mut()
        .zip(other.data.par_iter())
        .for_each(|(own_bucket, incoming_bucket)| {
            for (cell_id, incoming) in incoming_bucket {
                match own_bucket.entry(*cell_id) {
                    Entry::Occupied(mut slot) => {
                        slot.get_mut().merge(incoming);
                    }
                    Entry::Vacant(slot) => {
                        slot.insert(incoming.clone());
                    }
                }
            }
        });
}
/// Merges `other` into `self` sequentially, consuming `other`.
///
/// Each incoming cell is routed by its own `name` field (via `to_key`). A
/// cell already present in `self` is combined via `CellData::merge`; a new
/// cell is moved in without cloning. Does nothing when `other` is empty;
/// otherwise the export cache is invalidated first.
pub fn merge_single_thread(&mut self, mut other: Scdata) {
    use std::collections::hash_map::Entry;

    if other.is_empty() {
        return;
    }
    self.invalidate_export_cache();

    for bucket in other.data.iter_mut() {
        for incoming in bucket.values_mut() {
            let cell_id = incoming.name;
            let slot_index = self.to_key(&cell_id);
            match self.data[slot_index].entry(cell_id) {
                Entry::Occupied(mut slot) => {
                    slot.get_mut().merge(incoming);
                }
                Entry::Vacant(slot) => {
                    // `other` is dropped afterwards, so its cell can be
                    // moved out, leaving a default value behind.
                    slot.insert(std::mem::take(incoming));
                }
            }
        }
    }
}
/// Records one gene/UMI observation for the cell `name`.
///
/// The cell entry is created on demand. `report.ok_reads` is incremented
/// for every call. When `CellData::add` rejects the observation (returns
/// `false`), `report.pcr_duplicates` and `report.local_dup` are incremented
/// as well. The export cache is invalidated on every call.
///
/// `_value` is accepted but not used by this method.
///
/// Returns the result of `CellData::add`: `true` if the observation was
/// added, `false` otherwise.
pub fn try_insert(
    &mut self,
    name: &u64,
    data: GeneUmiHash,
    _value: f32,
    report: &mut MappingInfo,
) -> bool {
    let bucket_index = self.to_key(name);
    self.invalidate_export_cache();

    let cell = self.data[bucket_index]
        .entry(*name)
        .or_insert_with(|| CellData::new(*name));
    report.ok_reads += 1;

    let added = cell.add(data);
    if !added {
        report.pcr_duplicates += 1;
        report.local_dup += 1;
    }
    added
}
/// Records one gene/UMI observation together with `value` for the cell `name`.
///
/// The cell entry is created on demand, the export cache is invalidated and
/// `report.ok_reads` is incremented on every call. Unlike `try_insert`, no
/// duplicate counters are touched here.
///
/// Returns whatever `CellData::add_value` returns for this observation.
pub fn try_insert_value(
    &mut self,
    name: &u64,
    data: GeneUmiHash,
    value: f32,
    report: &mut MappingInfo,
) -> bool {
    let bucket_index = self.to_key(name);
    self.invalidate_export_cache();

    let cell = self.data[bucket_index]
        .entry(*name)
        .or_insert_with(|| CellData::new(*name));
    report.ok_reads += 1;

    let outcome = cell.add_value(data, value);
    outcome
}
/// Verifies that the container can be exported as a sparse matrix.
///
/// # Errors
///
/// Returns an error message when `finalize_for_export` has not been run
/// since the last modification, or when it ran but left no export cells.
pub(crate) fn check_sparse_export_ready(&self) -> Result<(), String> {
    let has_no_cells = self.export_cell_ids.is_empty();
    match (self.checked, has_no_cells) {
        // The "not finalized" condition takes precedence over "no cells".
        (false, _) => Err("Sparse export requires finalize_for_export(...) first.".to_string()),
        (true, true) => Err("Sparse export failed: no export cells available.".to_string()),
        (true, false) => Ok(()),
    }
}
/// Returns how many cell ids are currently in the export list.
///
/// This is 0 until `finalize_for_export` has populated the list.
pub fn passing_cells(&self) -> usize {
    let exported: &[u64] = &self.export_cell_ids;
    exported.len()
}
/// Returns the ids of all stored cells as a set.
pub fn cell_ids(&self) -> HashSet<u64> {
    let mut ids = HashSet::with_capacity(self.len());
    for bucket in &self.data {
        ids.extend(bucket.keys().copied());
    }
    ids
}
fn passing_cell_set_by_umi(&self, min_count: usize) -> HashSet<u64> {
let keys = self.keys();
if keys.is_empty() {
return HashSet::new();
}
let n_threads = self.num_threads.max(1);
let chunk_size = keys.len() / n_threads + 1;
let passing: Vec<u64> = keys
.par_chunks(chunk_size)
.flat_map(|chunk| {
let mut keep = Vec::<u64>::with_capacity(chunk.len());
for key in chunk {
if let Some(cell) = self.get(key)
&& cell.total_umis() >= min_count
{
keep.push(*key);
}
}
keep
})
.collect();
passing.into_iter().collect()
}
/// Recomputes `feature_ids_with_data` and `total_feature_data_entries`.
///
/// Every bucket is scanned in parallel for the feature ids that occur as
/// keys of a cell's `total_reads`, while the number of such entries is
/// summed up. The resulting id list keeps the order of
/// `index.ordered_feature_ids()` and contains only ids that were observed.
fn rebuild_feature_ids_with_data<I: FeatureIndex>(&mut self, index: &I) {
    let (seen_ids, entry_total) = self
        .data
        .par_iter()
        .map(|bucket| {
            // Per-bucket partial result; combined in the reduce step below.
            let mut ids_here = HashSet::with_capacity(64);
            let mut entries_here = 0usize;
            for cell in bucket.values() {
                let reads = &cell.total_reads;
                ids_here.extend(reads.keys().copied());
                entries_here += reads.len();
            }
            (ids_here, entries_here)
        })
        .reduce(
            || (HashSet::new(), 0usize),
            |(mut merged_ids, merged_total), (ids, total)| {
                merged_ids.extend(ids);
                (merged_ids, merged_total + total)
            },
        );

    self.feature_ids_with_data = index
        .ordered_feature_ids()
        .into_iter()
        .filter(|feature_id| seen_ids.contains(feature_id))
        .collect();
    self.total_feature_data_entries = entry_total;
}
/// Removes every cell whose id is not in `keep` and refreshes the export list.
///
/// Afterwards `export_cell_ids` holds the ids of all remaining cells in
/// ascending order.
fn restrict_to_cells(&mut self, keep: &HashSet<u64>) {
    self.data
        .iter_mut()
        .for_each(|bucket| bucket.retain(|cell_id, _| keep.contains(cell_id)));

    // Whatever is still stored is exactly the set of cells to export.
    let mut surviving_ids = self.keys();
    surviving_ids.sort_unstable();
    self.export_cell_ids = surviving_ids;
}
/// Prepares the container for export.
///
/// Steps, in order:
/// 1. determine the cells whose `total_umis()` is at least `min_total_umis`,
/// 2. permanently remove all other cells and build the sorted export id list,
/// 3. rebuild the list of features with data using `index`,
/// 4. mark the container as checked.
///
/// Note that step 2 is destructive: cells below the threshold are dropped
/// from the container itself, not just hidden from the export.
pub fn finalize_for_export<I: FeatureIndex>(&mut self, min_total_umis: usize, index: &I) {
    let surviving_cells = self.passing_cell_set_by_umi(min_total_umis);
    self.restrict_to_cells(&surviving_cells);
    self.rebuild_feature_ids_with_data(index);
    self.checked = true;
}
/// Returns the current export cell id list as a slice.
///
/// The list is empty until `finalize_for_export` has been run, and is
/// emptied again whenever the export cache is invalidated.
pub fn export_cell_ids(&self) -> &[u64] {
    self.export_cell_ids.as_slice()
}
}