use crate::constants::{MAX_READS, RC_FLAG_BIT, READ_INDEX_MASK};
#[derive(Debug)]
pub struct QueryInvertedIndex {
pub(crate) entries: Vec<(u64, u32)>,
pub(crate) fwd_counts: Vec<u32>,
pub(crate) rc_counts: Vec<u32>,
unique_count: usize,
}
impl QueryInvertedIndex {
#[inline]
pub fn pack_read_id(read_idx: u32, is_rc: bool) -> u32 {
debug_assert!(read_idx <= READ_INDEX_MASK, "Read index exceeds 31 bits");
if is_rc {
read_idx | RC_FLAG_BIT
} else {
read_idx
}
}
#[inline]
pub fn unpack_read_id(packed: u32) -> (u32, bool) {
let is_rc = (packed & RC_FLAG_BIT) != 0;
let read_idx = packed & READ_INDEX_MASK;
(read_idx, is_rc)
}
pub fn num_entries(&self) -> usize {
self.entries.len()
}
pub fn num_reads(&self) -> usize {
self.fwd_counts.len()
}
pub fn fwd_count(&self, read_idx: usize) -> u32 {
self.fwd_counts[read_idx]
}
pub fn rc_count(&self, read_idx: usize) -> u32 {
self.rc_counts[read_idx]
}
pub fn num_unique_minimizers(&self) -> usize {
self.unique_count
}
pub fn unique_minimizers(&self) -> Vec<u64> {
let mut result = Vec::with_capacity(self.unique_count);
for &(m, _) in &self.entries {
if result.last() != Some(&m) {
result.push(m);
}
}
result
}
pub fn minimizer_range(&self) -> Option<(u64, u64)> {
if self.entries.is_empty() {
None
} else {
Some((self.entries[0].0, self.entries[self.entries.len() - 1].0))
}
}
fn compute_unique_count(entries: &[(u64, u32)]) -> usize {
if entries.is_empty() {
0
} else {
entries.windows(2).filter(|w| w[0].0 != w[1].0).count() + 1
}
}
pub const MAX_READS: usize = MAX_READS;
pub fn build(queries: &[(Vec<u64>, Vec<u64>)]) -> Self {
assert!(
queries.len() <= Self::MAX_READS,
"Batch size {} exceeds maximum {} reads (31-bit limit)",
queries.len(),
Self::MAX_READS
);
if queries.is_empty() {
return QueryInvertedIndex {
entries: Vec::new(),
fwd_counts: Vec::new(),
rc_counts: Vec::new(),
unique_count: 0,
};
}
let mut total_entries = 0usize;
let mut fwd_counts = Vec::with_capacity(queries.len());
let mut rc_counts = Vec::with_capacity(queries.len());
for (fwd, rc) in queries {
total_entries = total_entries
.checked_add(fwd.len())
.and_then(|t| t.checked_add(rc.len()))
.expect("Total minimizer count overflow");
fwd_counts.push(fwd.len() as u32);
rc_counts.push(rc.len() as u32);
}
if total_entries == 0 {
return QueryInvertedIndex {
entries: Vec::new(),
fwd_counts,
rc_counts,
unique_count: 0,
};
}
let mut entries: Vec<(u64, u32)> = Vec::with_capacity(total_entries);
for (read_idx, (fwd, rc)) in queries.iter().enumerate() {
let read_idx = read_idx as u32;
for &m in fwd {
entries.push((m, Self::pack_read_id(read_idx, false)));
}
for &m in rc {
entries.push((m, Self::pack_read_id(read_idx, true)));
}
}
entries.sort_unstable_by_key(|(m, _)| *m);
let unique_count = Self::compute_unique_count(&entries);
QueryInvertedIndex {
entries,
fwd_counts,
rc_counts,
unique_count,
}
}
pub fn from_sorted_coo(
entries: Vec<(u64, u32)>,
fwd_counts: Vec<u32>,
rc_counts: Vec<u32>,
) -> Self {
debug_assert!(
entries.windows(2).all(|w| w[0].0 <= w[1].0),
"from_sorted_coo: entries must be sorted by minimizer"
);
let unique_count = Self::compute_unique_count(&entries);
QueryInvertedIndex {
entries,
fwd_counts,
rc_counts,
unique_count,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_pack_unpack_read_id() {
assert_eq!(
QueryInvertedIndex::unpack_read_id(QueryInvertedIndex::pack_read_id(0, false)),
(0, false)
);
assert_eq!(
QueryInvertedIndex::unpack_read_id(QueryInvertedIndex::pack_read_id(12345, false)),
(12345, false)
);
assert_eq!(
QueryInvertedIndex::unpack_read_id(QueryInvertedIndex::pack_read_id(0, true)),
(0, true)
);
assert_eq!(
QueryInvertedIndex::unpack_read_id(QueryInvertedIndex::pack_read_id(12345, true)),
(12345, true)
);
assert_eq!(
QueryInvertedIndex::unpack_read_id(QueryInvertedIndex::pack_read_id(0x7FFFFFFF, false)),
(0x7FFFFFFF, false)
);
assert_eq!(
QueryInvertedIndex::unpack_read_id(QueryInvertedIndex::pack_read_id(0x7FFFFFFF, true)),
(0x7FFFFFFF, true)
);
}
#[test]
fn test_query_inverted_empty() {
let queries: Vec<(Vec<u64>, Vec<u64>)> = vec![];
let qidx = QueryInvertedIndex::build(&queries);
assert_eq!(qidx.num_entries(), 0);
assert_eq!(qidx.unique_minimizers().len(), 0);
assert_eq!(qidx.num_reads(), 0);
}
#[test]
fn test_query_inverted_single_read() {
let queries = vec![(vec![100, 200, 300], vec![150, 250])];
let qidx = QueryInvertedIndex::build(&queries);
assert_eq!(qidx.unique_minimizers().len(), 5);
assert_eq!(qidx.num_entries(), 5);
assert_eq!(qidx.num_reads(), 1);
assert_eq!(qidx.unique_minimizers(), &[100, 150, 200, 250, 300]);
assert_eq!(qidx.fwd_counts[0], 3);
assert_eq!(qidx.rc_counts[0], 2);
}
#[test]
fn test_query_inverted_overlapping_minimizers() {
let queries = vec![
(vec![100, 200], vec![150]), (vec![100, 300], vec![150]), ];
let qidx = QueryInvertedIndex::build(&queries);
assert_eq!(qidx.unique_minimizers().len(), 4);
assert_eq!(qidx.num_entries(), 6);
assert_eq!(qidx.num_reads(), 2);
assert_eq!(qidx.unique_minimizers(), &[100, 150, 200, 300]);
let count_100 = qidx.entries.iter().filter(|(m, _)| *m == 100).count();
assert_eq!(count_100, 2);
let reads_for_100: Vec<(u32, bool)> = qidx
.entries
.iter()
.filter(|(m, _)| *m == 100)
.map(|(_, p)| QueryInvertedIndex::unpack_read_id(*p))
.collect();
assert!(reads_for_100.contains(&(0, false)));
assert!(reads_for_100.contains(&(1, false)));
let reads_for_150: Vec<(u32, bool)> = qidx
.entries
.iter()
.filter(|(m, _)| *m == 150)
.map(|(_, p)| QueryInvertedIndex::unpack_read_id(*p))
.collect();
assert!(reads_for_150.contains(&(0, true)));
assert!(reads_for_150.contains(&(1, true)));
}
#[test]
fn test_query_inverted_fwd_rc_counts() {
let queries = vec![
(vec![100, 200, 300], vec![150, 250]), (vec![100], vec![150, 250, 350, 450]), (vec![], vec![500, 600]), ];
let qidx = QueryInvertedIndex::build(&queries);
assert_eq!(qidx.fwd_counts.len(), 3);
assert_eq!(qidx.rc_counts.len(), 3);
assert_eq!(qidx.fwd_counts[0], 3);
assert_eq!(qidx.rc_counts[0], 2);
assert_eq!(qidx.fwd_counts[1], 1);
assert_eq!(qidx.rc_counts[1], 4);
assert_eq!(qidx.fwd_counts[2], 0);
assert_eq!(qidx.rc_counts[2], 2);
}
#[test]
fn test_query_inverted_all_empty_reads() {
let queries = vec![(vec![], vec![]), (vec![], vec![])];
let qidx = QueryInvertedIndex::build(&queries);
assert_eq!(qidx.num_entries(), 0);
assert_eq!(qidx.unique_minimizers().len(), 0);
assert_eq!(qidx.num_reads(), 2);
assert_eq!(qidx.fwd_counts, vec![0, 0]);
assert_eq!(qidx.rc_counts, vec![0, 0]);
}
#[test]
fn test_query_inverted_max_reads_constant() {
assert_eq!(QueryInvertedIndex::MAX_READS, 0x7FFFFFFF);
assert_eq!(QueryInvertedIndex::MAX_READS, (1 << 31) - 1);
}
#[test]
#[should_panic(expected = "31-bit limit")]
fn test_query_inverted_overflow_too_many_reads() {
let queries: Vec<(Vec<u64>, Vec<u64>)> = Vec::new();
let fake_len = QueryInvertedIndex::MAX_READS + 1;
assert!(
fake_len <= QueryInvertedIndex::MAX_READS,
"Batch size {} exceeds maximum {} reads (31-bit limit)",
fake_len,
QueryInvertedIndex::MAX_READS
);
let _ = QueryInvertedIndex::build(&queries);
}
#[test]
fn test_query_inverted_accessor_methods() {
let queries = vec![
(vec![100, 200, 300], vec![150, 250]), (vec![100], vec![150, 250, 350]), ];
let qidx = QueryInvertedIndex::build(&queries);
assert_eq!(qidx.fwd_count(0), 3);
assert_eq!(qidx.fwd_count(1), 1);
assert_eq!(qidx.rc_count(0), 2);
assert_eq!(qidx.rc_count(1), 3);
let mins = qidx.unique_minimizers();
assert!(mins.is_sorted());
assert_eq!(mins.len(), qidx.unique_minimizers().len());
}
#[test]
fn test_coo_entry_count_equals_total_minimizers() {
let queries = vec![
(vec![100, 200, 300], vec![150, 250]), (vec![100], vec![150, 250, 350]), ];
let qidx = QueryInvertedIndex::build(&queries);
assert_eq!(qidx.entries.len(), 9); }
#[test]
fn test_coo_entries_sorted_by_minimizer() {
let queries = vec![(vec![300, 100, 200], vec![250, 150]), (vec![50], vec![400])];
let qidx = QueryInvertedIndex::build(&queries);
assert!(qidx.entries.windows(2).all(|w| w[0].0 <= w[1].0));
}
#[test]
fn test_coo_no_deduplication() {
let queries = vec![(vec![100, 200], vec![]), (vec![100, 300], vec![])];
let qidx = QueryInvertedIndex::build(&queries);
let count_100 = qidx.entries.iter().filter(|(m, _)| *m == 100).count();
assert_eq!(
count_100, 2,
"COO should NOT deduplicate: 2 reads share minimizer 100"
);
}
#[test]
fn test_minimizer_range() {
let queries = vec![(vec![300, 100], vec![500])];
let qidx = QueryInvertedIndex::build(&queries);
assert_eq!(qidx.minimizer_range(), Some((100, 500)));
let empty_queries: Vec<(Vec<u64>, Vec<u64>)> = vec![];
let empty_qidx = QueryInvertedIndex::build(&empty_queries);
assert_eq!(empty_qidx.minimizer_range(), None);
}
#[test]
fn test_from_sorted_coo() {
let entries = vec![(100, 0u32), (100, 1), (200, 0), (300, 1)];
let fwd_counts = vec![2, 1];
let rc_counts = vec![0, 1];
let qidx = QueryInvertedIndex::from_sorted_coo(entries, fwd_counts, rc_counts);
assert_eq!(qidx.num_entries(), 4);
assert_eq!(qidx.unique_minimizers(), vec![100u64, 200, 300]);
assert_eq!(qidx.num_reads(), 2);
}
#[cfg(debug_assertions)]
#[test]
#[should_panic(expected = "entries must be sorted")]
fn test_from_sorted_coo_rejects_unsorted() {
let entries = vec![(200, 0u32), (100, 1)]; QueryInvertedIndex::from_sorted_coo(entries, vec![1], vec![1]);
}
#[test]
fn test_unique_minimizers_capacity_is_minimal() {
let queries = vec![
(vec![100, 200, 300], vec![150, 250]),
(vec![100, 300, 400], vec![150, 350]),
];
let qidx = QueryInvertedIndex::build(&queries);
let mins = qidx.unique_minimizers();
assert_eq!(
mins.capacity(),
mins.len(),
"unique_minimizers() should allocate exactly unique_count capacity, got cap={} len={}",
mins.capacity(),
mins.len()
);
assert_eq!(mins, vec![100, 150, 200, 250, 300, 350, 400]);
}
}