use crate::error::FerroError;
use crate::reference::Strand;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::Path;
/// A transcript record in the normalized in-memory form used by `CdotMapper`.
///
/// Exon arrays are `[genome_start, genome_end, tx_start, tx_end]`
/// (0-based, half-open intervals; see [`Exon::from_array`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CdotTranscript {
    /// Gene symbol, when present in the source file.
    #[serde(default)]
    pub gene_name: Option<String>,
    /// Chromosome / contig accession; accepts either "contig" or "chr" JSON keys.
    #[serde(alias = "contig", alias = "chr")]
    pub contig: String,
    /// Parsed from "+"/"1"/"plus" or "-"/"-1"/"minus" (see `deserialize_strand`).
    #[serde(deserialize_with = "deserialize_strand")]
    pub strand: Strand,
    /// `[genome_start, genome_end, tx_start, tx_end]` per exon, in transcript order.
    pub exons: Vec<[u64; 4]>,
    /// CDS bounds in transcript coordinates (0-based, half-open), when coding.
    #[serde(default)]
    pub cds_start: Option<u64>,
    #[serde(default)]
    pub cds_end: Option<u64>,
    /// Per-exon alignment gaps parsed from cdot "gap" strings; never serialized.
    #[serde(skip)]
    pub exon_cigars: Vec<Option<Vec<CigarOp>>>,
    #[serde(default)]
    pub gene_id: Option<String>,
    /// Protein accession, when known.
    #[serde(default)]
    pub protein: Option<String>,
}
/// Transcript entry as it appears verbatim in cdot JSON, before normalization.
///
/// cdot has two layouts: per-genome-build data under `genome_builds`, and a
/// flat layout with `contig`/`strand`/`exons` at the top level.
/// `to_transcript` prefers the former and falls back to the latter.
#[derive(Debug, Clone, Deserialize)]
#[allow(dead_code)]
struct RawCdotTranscript {
    #[serde(default)]
    gene_name: Option<String>,
    #[serde(default)]
    gene_version: Option<String>,
    #[serde(default)]
    biotype: Option<Vec<String>>,
    /// Keyed by build name, e.g. "GRCh38".
    #[serde(default)]
    genome_builds: Option<HashMap<String, RawGenomeBuild>>,
    /// Transcript-coordinate CDS bounds; preferred over the genomic
    /// `cds_start`/`cds_end` when present (see `from_genome_build`).
    #[serde(default)]
    start_codon: Option<u64>,
    #[serde(default)]
    stop_codon: Option<u64>,
    #[serde(default)]
    contig: Option<String>,
    #[serde(default)]
    strand: Option<String>,
    /// Flat-layout exon rows: at least `[g_start, g_end, tx_start, tx_end]`.
    #[serde(default)]
    exons: Option<Vec<Vec<serde_json::Value>>>,
    #[serde(default)]
    cds_start: Option<u64>,
    #[serde(default)]
    cds_end: Option<u64>,
}
/// Per-genome-build transcript data from a cdot `genome_builds` entry.
#[derive(Debug, Clone, Deserialize)]
struct RawGenomeBuild {
    contig: String,
    #[serde(default)]
    strand: Option<String>,
    /// Exon rows with at least 5 columns: 0,1 = genomic start/end,
    /// 3,4 = transcript start/end, optional column 5 = gap (CIGAR-like)
    /// string. Column 2 is skipped by `from_genome_build` — presumably the
    /// exon number; TODO confirm against the cdot format spec.
    exons: Vec<Vec<serde_json::Value>>,
    /// Genomic CDS bounds (mapped into transcript coordinates on conversion).
    #[serde(default)]
    cds_start: Option<u64>,
    #[serde(default)]
    cds_end: Option<u64>,
}
impl RawCdotTranscript {
    /// Normalizes this raw entry into a `CdotTranscript` for `genome_build`,
    /// preferring per-build data and falling back to the flat layout.
    /// Returns `None` when neither layout yields a usable transcript.
    fn to_transcript(&self, genome_build: &str) -> Option<CdotTranscript> {
        if let Some(builds) = &self.genome_builds {
            if let Some(build) = builds.get(genome_build) {
                return self.from_genome_build(build);
            }
        }
        self.from_flat_format()
    }

    /// Builds a transcript from a `genome_builds` entry.
    ///
    /// Exon rows shorter than 5 columns (or with non-numeric cells) are
    /// silently dropped; malformed gap strings are logged and dropped.
    #[allow(clippy::wrong_self_convention)]
    fn from_genome_build(&self, build: &RawGenomeBuild) -> Option<CdotTranscript> {
        // Missing strand defaults to "+".
        let strand = parse_strand(build.strand.as_deref().unwrap_or("+"))?;
        let mut exon_pairs: Vec<([u64; 4], Option<Vec<CigarOp>>)> = build
            .exons
            .iter()
            .filter_map(|e| {
                if e.len() >= 5 {
                    // Columns 0,1 = genomic start/end; 3,4 = tx start/end.
                    // Column 2 is intentionally skipped (see RawGenomeBuild).
                    let exon = [
                        e[0].as_u64()?,
                        e[1].as_u64()?,
                        e[3].as_u64()?,
                        e[4].as_u64()?,
                    ];
                    // Optional column 5: per-exon gap string ("M... I... D...").
                    let cigar = if e.len() > 5 {
                        e[5].as_str().and_then(|s| match parse_cigar(s) {
                            Ok(ops) => Some(ops),
                            Err(err) => {
                                log::warn!("Malformed CIGAR string '{}': {}", s, err);
                                None
                            }
                        })
                    } else {
                        None
                    };
                    Some((exon, cigar))
                } else {
                    None
                }
            })
            .collect();
        // Order exons by transcript start so downstream lookups can assume
        // transcript order regardless of JSON ordering.
        exon_pairs.sort_by_key(|(e, _)| e[2]);
        let (exons, exon_cigars): (Vec<[u64; 4]>, Vec<Option<Vec<CigarOp>>>) =
            exon_pairs.into_iter().unzip();
        // CDS bounds: start/stop_codon are already transcript coordinates and
        // take precedence; otherwise map the genomic cds_start/cds_end through
        // the exon structure. The min/max + 1 handling converts the two mapped
        // endpoints into a half-open transcript interval on either strand.
        let (cds_start, cds_end) = if self.start_codon.is_some() || self.stop_codon.is_some() {
            (self.start_codon, self.stop_codon)
        } else {
            match (build.cds_start, build.cds_end) {
                (Some(g_start), Some(g_end)) => {
                    let tx_cds_start = genomic_to_tx_pos(&exons, g_start, strand);
                    let tx_cds_end = genomic_to_tx_pos(&exons, g_end, strand);
                    match (tx_cds_start, tx_cds_end) {
                        (Some(s), Some(e)) => {
                            let (start, end) = if s < e {
                                (s, e.saturating_add(1))
                            } else {
                                (e, s.saturating_add(1))
                            };
                            (Some(start), Some(end))
                        }
                        // If either bound cannot be mapped, treat as non-coding.
                        _ => (None, None),
                    }
                }
                _ => (None, None),
            }
        };
        Some(CdotTranscript {
            gene_name: self.gene_name.clone(),
            contig: build.contig.clone(),
            strand,
            exons,
            exon_cigars,
            cds_start,
            cds_end,
            gene_id: None,
            protein: None,
        })
    }

    /// Builds a transcript from the flat (top-level) cdot layout.
    ///
    /// Flat exon rows carry 4 columns `[g_start, g_end, tx_start, tx_end]`
    /// and no gap strings; `cds_start`/`cds_end` are used as-is.
    /// NOTE(review): unlike `from_genome_build`, exons are not re-sorted here
    /// — presumably the flat layout is already in transcript order; confirm.
    #[allow(clippy::wrong_self_convention)]
    fn from_flat_format(&self) -> Option<CdotTranscript> {
        let contig = self.contig.as_ref()?.clone();
        let strand = parse_strand(self.strand.as_deref().unwrap_or("+"))?;
        let exons: Vec<[u64; 4]> = self
            .exons
            .as_ref()?
            .iter()
            .filter_map(|e| {
                if e.len() >= 4 {
                    Some([
                        e[0].as_u64()?,
                        e[1].as_u64()?,
                        e[2].as_u64()?,
                        e[3].as_u64()?,
                    ])
                } else {
                    None
                }
            })
            .collect();
        Some(CdotTranscript {
            gene_name: self.gene_name.clone(),
            contig,
            strand,
            exon_cigars: Vec::new(),
            exons,
            cds_start: self.cds_start,
            cds_end: self.cds_end,
            gene_id: None,
            protein: None,
        })
    }
}
/// Parses a strand designator as found in cdot JSON.
///
/// Accepts "+"/"1"/"plus" for the plus strand and "-"/"-1"/"minus" for the
/// minus strand; anything else yields `None`.
fn parse_strand(s: &str) -> Option<Strand> {
    if matches!(s, "+" | "1" | "plus") {
        Some(Strand::Plus)
    } else if matches!(s, "-" | "-1" | "minus") {
        Some(Strand::Minus)
    } else {
        None
    }
}
/// Maps a genomic position onto transcript coordinates using exon rows of the
/// form `[g_start, g_end, tx_start, tx_end]` (0-based, half-open).
///
/// The first pass handles positions strictly inside an exon, respecting
/// strand orientation. The second pass is a boundary fallback for CDS bounds
/// sitting exactly on an exon edge: a position equal to an exon's `g_end`
/// maps to `tx_end`, and the position just before `g_start` maps to
/// `tx_start - 1`.
/// NOTE(review): the fallback pass ignores `strand` — it assumes plus-strand
/// orientation (g_end -> tx_end). Confirm minus-strand genomic CDS bounds on
/// exon edges still map correctly, since callers min/max the two endpoints.
fn genomic_to_tx_pos(exons: &[[u64; 4]], genomic_pos: u64, strand: Strand) -> Option<u64> {
    for e in exons {
        let (g_start, g_end, tx_start, _tx_end) = (e[0], e[1], e[2], e[3]);
        if genomic_pos >= g_start && genomic_pos < g_end {
            // Offset within the exon, measured in transcript direction.
            let offset = match strand {
                Strand::Plus => genomic_pos - g_start,
                Strand::Minus => g_end - 1 - genomic_pos,
            };
            return Some(tx_start + offset);
        }
    }
    // Boundary fallback: exact exon-edge positions (first match wins).
    for e in exons {
        let (g_start, g_end, tx_start, tx_end) = (e[0], e[1], e[2], e[3]);
        if genomic_pos == g_end {
            return Some(tx_end);
        }
        if genomic_pos == g_start.saturating_sub(1) {
            return Some(tx_start.saturating_sub(1));
        }
    }
    None
}
fn deserialize_strand<'de, D>(deserializer: D) -> Result<Strand, D::Error>
where
D: serde::Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
match s.as_str() {
"+" | "1" | "plus" => Ok(Strand::Plus),
"-" | "-1" | "minus" => Ok(Strand::Minus),
_ => Err(serde::de::Error::custom(format!("Invalid strand: {}", s))),
}
}
/// A single exon with both genomic and transcript coordinates
/// (0-based, half-open intervals), numbered 1-based in transcript order.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Exon {
    pub number: u32,
    pub genome_start: u64,
    pub genome_end: u64,
    pub tx_start: u64,
    pub tx_end: u64,
}

impl Exon {
    /// Builds an `Exon` from a `[genome_start, genome_end, tx_start, tx_end]`
    /// array, attaching the given 1-based exon number.
    pub fn from_array(number: u32, arr: [u64; 4]) -> Self {
        let [genome_start, genome_end, tx_start, tx_end] = arr;
        Exon {
            number,
            genome_start,
            genome_end,
            tx_start,
            tx_end,
        }
    }

    /// Genomic length of the exon (half-open interval width).
    pub fn len(&self) -> u64 {
        self.genome_end - self.genome_start
    }

    /// True when the exon spans zero genomic bases.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// True when `pos` lies within the exon's genomic interval.
    pub fn contains_genome_pos(&self, pos: u64) -> bool {
        (self.genome_start..self.genome_end).contains(&pos)
    }

    /// True when `pos` lies within the exon's transcript interval.
    pub fn contains_tx_pos(&self, pos: u64) -> bool {
        (self.tx_start..self.tx_end).contains(&pos)
    }
}
impl CdotTranscript {
    /// Returns the exons as [`Exon`] values, numbered 1-based in transcript
    /// order (the order of `self.exons`).
    pub fn get_exons(&self) -> Vec<Exon> {
        self.exons
            .iter()
            .enumerate()
            .map(|(i, arr)| Exon::from_array((i + 1) as u32, *arr))
            .collect()
    }

    /// Length of the spliced transcript: the sum of all exon transcript spans.
    pub fn transcript_length(&self) -> u64 {
        self.exons.iter().map(|e| e[3] - e[2]).sum()
    }

    /// CDS length in bases, or `None` when either CDS bound is missing.
    pub fn cds_length(&self) -> Option<u64> {
        match (self.cds_start, self.cds_end) {
            (Some(start), Some(end)) => Some(end - start),
            _ => None,
        }
    }

    /// Finds the exon containing the (0-based) transcript position, if any.
    pub fn exon_for_tx_pos(&self, tx_pos: u64) -> Option<Exon> {
        for (i, arr) in self.exons.iter().enumerate() {
            if tx_pos >= arr[2] && tx_pos < arr[3] {
                return Some(Exon::from_array((i + 1) as u32, *arr));
            }
        }
        None
    }

    /// Finds the exon containing the (0-based) genomic position, if any.
    pub fn exon_for_genome_pos(&self, genome_pos: u64) -> Option<Exon> {
        for (i, arr) in self.exons.iter().enumerate() {
            if genome_pos >= arr[0] && genome_pos < arr[1] {
                return Some(Exon::from_array((i + 1) as u32, *arr));
            }
        }
        None
    }

    /// Maps a transcript position to its genomic position.
    ///
    /// Returns `None` for positions outside every exon. On the minus strand
    /// transcript coordinates run against the genome, so the offset is taken
    /// backwards from the exon's (exclusive) genomic end.
    pub fn tx_to_genome(&self, tx_pos: u64) -> Option<u64> {
        let exon = self.exon_for_tx_pos(tx_pos)?;
        let offset = tx_pos - exon.tx_start;
        match self.strand {
            Strand::Plus => Some(exon.genome_start + offset),
            Strand::Minus => Some(exon.genome_end - 1 - offset),
        }
    }

    /// Maps a genomic position to its transcript position (exonic positions
    /// only; intronic/intergenic positions return `None`).
    pub fn genome_to_tx(&self, genome_pos: u64) -> Option<u64> {
        let exon = self.exon_for_genome_pos(genome_pos)?;
        match self.strand {
            Strand::Plus => {
                let offset = genome_pos - exon.genome_start;
                Some(exon.tx_start + offset)
            }
            Strand::Minus => {
                let offset = exon.genome_end - 1 - genome_pos;
                Some(exon.tx_start + offset)
            }
        }
    }

    /// Converts an HGVS-style CDS coordinate to a 0-based transcript position.
    ///
    /// Positive values are 1-based positions within the CDS (c.1 is the first
    /// CDS base); negative values are 5' UTR offsets (c.-1 is the base just
    /// before the CDS). HGVS has no position 0, so 0 returns `None`, as does
    /// a 5' offset running past the transcript start.
    pub fn cds_to_tx(&self, cds_pos: i64) -> Option<u64> {
        let cds_start = self.cds_start?;
        if cds_pos > 0 {
            Some(cds_start + (cds_pos as u64 - 1))
        } else if cds_pos < 0 {
            let offset = (-cds_pos) as u64;
            if offset <= cds_start {
                Some(cds_start - offset)
            } else {
                None
            }
        } else {
            None
        }
    }

    /// Classifies a 0-based transcript position relative to the CDS.
    ///
    /// Offsets are 1-based: `FivePrimeUtr(n)` for c.-n, `Cds(n)` for c.n,
    /// `ThreePrimeUtr(n)` for c.*n. Requires both CDS bounds to be set.
    pub fn tx_to_cds(&self, tx_pos: u64) -> Option<CdsPosition> {
        let cds_start = self.cds_start?;
        let cds_end = self.cds_end?;
        if tx_pos < cds_start {
            let offset = cds_start - tx_pos;
            Some(CdsPosition::FivePrimeUtr(offset as i64))
        } else if tx_pos < cds_end {
            let cds_pos = tx_pos - cds_start + 1;
            Some(CdsPosition::Cds(cds_pos as i64))
        } else {
            let offset = tx_pos - cds_end + 1;
            Some(CdsPosition::ThreePrimeUtr(offset as i64))
        }
    }

    /// Locates a genomic position inside an intron.
    ///
    /// Returns `(intron_number, offset)`: the intron number is 1-based
    /// (intron i follows exon i in transcript order) and the offset is the
    /// 1-based distance from the donor site, so the first intronic base after
    /// exon i yields offset 1 on either strand.
    pub fn intron_for_genome_pos(&self, genome_pos: u64) -> Option<(u32, i64)> {
        let exons = self.get_exons();
        for i in 0..exons.len().saturating_sub(1) {
            let current = &exons[i];
            let next = &exons[i + 1];
            // Genomic half-open span of the intron between exons i and i+1.
            // On the minus strand `current` sits at higher genomic
            // coordinates than `next`.
            let (intron_start, intron_end) = match self.strand {
                Strand::Plus => (current.genome_end, next.genome_start),
                Strand::Minus => (next.genome_end, current.genome_start),
            };
            if genome_pos >= intron_start && genome_pos < intron_end {
                let intron_num = (i + 1) as u32;
                // 1-based distance from the end of exon i in transcript
                // orientation. Exons are half-open, so on the plus strand the
                // first intronic base is `current.genome_end` (a 0-based
                // distance, hence the +1). On the minus strand the first
                // intronic base is `current.genome_start - 1`, making
                // `genome_start - pos` already 1-based — adding 1 there (as
                // the previous code did) shifted minus-strand offsets to
                // 2..=len instead of 1..=len.
                let intronic_offset = match self.strand {
                    Strand::Plus => (genome_pos as i64) - (current.genome_end as i64) + 1,
                    Strand::Minus => (current.genome_start as i64) - (genome_pos as i64),
                };
                return Some((intron_num, intronic_offset));
            }
        }
        None
    }
}
/// A transcript position classified relative to the CDS, with 1-based
/// offsets as produced by `CdotTranscript::tx_to_cds`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CdsPosition {
    /// c.-n: n bases upstream of the CDS start.
    FivePrimeUtr(i64),
    /// c.n: the n-th CDS base (1-based).
    Cds(i64),
    /// c.*n: n bases past the CDS end (1-based).
    ThreePrimeUtr(i64),
}
/// One operation of a cdot gap (CIGAR-like) string, as parsed by
/// `parse_cigar`. Lengths are in bases.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CigarOp {
    /// `M`: aligned bases (advances both transcript and genome).
    Match(u64),
    /// `I`: bases present in the transcript but not the genome.
    Insertion(u64),
    /// `D`: bases present in the genome but not the transcript.
    Deletion(u64),
}
/// Parses a cdot-style gap/CIGAR string such as `"M185 I3 M250"`.
///
/// Tokens are whitespace-separated with the operation letter first (`M`,
/// `I`, `D`) followed by a decimal length — note this is the cdot "gap"
/// layout, not standard length-first CIGAR. Blank or whitespace-only input
/// yields an empty vector.
///
/// # Errors
/// Returns a parse error for tokens without a length, with a non-numeric
/// length, or with an unknown operation letter.
pub fn parse_cigar(cigar_str: &str) -> Result<Vec<CigarOp>, FerroError> {
    let trimmed = cigar_str.trim();
    if trimmed.is_empty() {
        return Ok(Vec::new());
    }
    trimmed
        .split_whitespace()
        .map(|token| {
            // Take the op char via the char iterator rather than
            // `token.split_at(1)`, which panics when the first character is
            // multi-byte (e.g. "é5" passed the old `len() < 2` byte check
            // and then split inside the character).
            let mut chars = token.chars();
            let op_char = chars
                .next()
                .expect("split_whitespace yields non-empty tokens");
            let len_str = chars.as_str();
            if len_str.is_empty() {
                return Err(FerroError::parse(
                    0,
                    format!("Invalid CIGAR token (too short): '{token}'"),
                ));
            }
            let length: u64 = len_str.parse().map_err(|_| {
                FerroError::parse(0, format!("Invalid CIGAR length in token: '{token}'"))
            })?;
            match op_char {
                'M' => Ok(CigarOp::Match(length)),
                'I' => Ok(CigarOp::Insertion(length)),
                'D' => Ok(CigarOp::Deletion(length)),
                _ => Err(FerroError::parse(
                    0,
                    format!("Unknown CIGAR operation '{op_char}' in token: '{token}'"),
                )),
            }
        })
        .collect()
}
/// Sums the lengths of all insertions that lie entirely at or before
/// `tx_pos`, walking the CIGAR operations in order.
///
/// Matches and insertions both advance the transcript cursor; deletions do
/// not. The walk stops once the cursor reaches `tx_pos`.
pub fn cumulative_insertion_offset(ops: &[CigarOp], tx_pos: u64) -> u64 {
    let mut cursor: u64 = 0;
    let mut inserted: u64 = 0;
    for &op in ops {
        match op {
            CigarOp::Match(len) => cursor += len,
            CigarOp::Insertion(len) => {
                // Only count insertions that end at or before the query
                // position.
                if cursor + len <= tx_pos {
                    inserted += len;
                }
                cursor += len;
            }
            // Deletions consume reference only; the transcript cursor
            // does not move.
            CigarOp::Deletion(_) => {}
        }
        if cursor >= tx_pos {
            break;
        }
    }
    inserted
}
/// Top-level structure of a cdot JSON file as deserialized verbatim.
/// Only `transcripts` is consumed; the remaining fields are accepted so
/// deserialization tolerates the full cdot file layout.
#[derive(Debug, Clone, Deserialize)]
#[allow(dead_code)]
struct RawCdotFile {
    transcripts: HashMap<String, RawCdotTranscript>,
    #[serde(default)]
    genome_builds: Option<Vec<String>>,
    #[serde(default)]
    cdot_version: Option<String>,
    /// Kept as opaque JSON; not interpreted here.
    #[serde(default)]
    genes: Option<serde_json::Value>,
    #[serde(default)]
    metadata: Option<serde_json::Value>,
}
/// A normalized cdot file: transcripts already converted to `CdotTranscript`.
/// Used by `CdotMapper::from_cdot_file` as a pre-processed input format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CdotFile {
    pub transcripts: HashMap<String, CdotTranscript>,
    /// Opaque per-build metadata; not interpreted here.
    #[serde(default)]
    pub genome_builds: Option<HashMap<String, serde_json::Value>>,
}
/// Lookup structure over a set of cdot transcripts.
///
/// Maintains three secondary indexes alongside the transcript store:
/// by contig, by versionless accession (part before the first '.'), and an
/// optional LRG -> RefSeq translation table.
#[derive(Debug, Clone)]
pub struct CdotMapper {
    transcripts: HashMap<String, CdotTranscript>,
    /// contig -> accessions of transcripts on that contig.
    contig_index: HashMap<String, Vec<String>>,
    /// versionless accession (e.g. "NM_000088") -> last-added versioned one.
    base_to_versioned: HashMap<String, String>,
    /// LRG transcript id (e.g. "LRG_1t1") -> RefSeq accession.
    lrg_to_refseq: HashMap<String, String>,
}
impl CdotMapper {
    /// Creates an empty mapper with no transcripts or mappings.
    pub fn new() -> Self {
        Self {
            transcripts: HashMap::new(),
            contig_index: HashMap::new(),
            base_to_versioned: HashMap::new(),
            lrg_to_refseq: HashMap::new(),
        }
    }

    /// Loads a plain (uncompressed) cdot JSON file, using the default
    /// "GRCh38" genome build.
    pub fn from_json_file<P: AsRef<Path>>(path: P) -> Result<Self, FerroError> {
        let file = File::open(path.as_ref()).map_err(|e| FerroError::Io {
            msg: format!("Failed to open cdot file: {}", e),
        })?;
        let reader = BufReader::new(file);
        Self::from_reader(reader)
    }

    /// Loads a gzip-compressed cdot JSON file, using the default "GRCh38"
    /// genome build.
    pub fn from_json_gz<P: AsRef<Path>>(path: P) -> Result<Self, FerroError> {
        let file = File::open(path.as_ref()).map_err(|e| FerroError::Io {
            msg: format!("Failed to open cdot file: {}", e),
        })?;
        let reader = BufReader::new(file);
        let decoder = flate2::read::GzDecoder::new(reader);
        // Buffer the decompressed stream for serde_json's many small reads.
        let reader = BufReader::new(decoder);
        Self::from_reader(reader)
    }

    /// Parses cdot JSON from any reader with the default "GRCh38" build.
    pub fn from_reader<R: Read>(reader: R) -> Result<Self, FerroError> {
        Self::from_reader_with_build(reader, "GRCh38")
    }

    /// Parses cdot JSON from any reader, selecting per-build data for
    /// `genome_build` when the file provides it.
    pub fn from_reader_with_build<R: Read>(
        reader: R,
        genome_build: &str,
    ) -> Result<Self, FerroError> {
        let raw_file: RawCdotFile = serde_json::from_reader(reader)?;
        Ok(Self::from_raw_cdot_file(raw_file, genome_build))
    }

    /// Normalizes raw transcripts into the mapper; entries that cannot be
    /// converted for the requested build are silently skipped.
    fn from_raw_cdot_file(raw_file: RawCdotFile, genome_build: &str) -> Self {
        let mut mapper = Self::new();
        for (accession, raw_transcript) in raw_file.transcripts {
            if let Some(transcript) = raw_transcript.to_transcript(genome_build) {
                mapper.add_transcript(accession, transcript);
            }
        }
        mapper
    }

    /// Builds a mapper from an already-normalized `CdotFile`.
    pub fn from_cdot_file(cdot_file: CdotFile) -> Self {
        let mut mapper = Self::new();
        for (accession, transcript) in cdot_file.transcripts {
            mapper.add_transcript(accession, transcript);
        }
        mapper
    }

    /// Inserts a transcript and updates the contig and versionless-accession
    /// indexes. A later insert with the same base accession overwrites the
    /// versionless mapping (last one wins).
    pub fn add_transcript(&mut self, accession: String, transcript: CdotTranscript) {
        self.contig_index
            .entry(transcript.contig.clone())
            .or_default()
            .push(accession.clone());
        // split('.').next() is always Some; it yields the accession without
        // its version suffix.
        if let Some(base) = accession.split('.').next() {
            self.base_to_versioned
                .insert(base.to_string(), accession.clone());
        }
        self.transcripts.insert(accession, transcript);
    }

    /// Loads an LRG mapping from a tab-separated file, returning the number
    /// of entries loaded.
    ///
    /// '#' comment lines, blank lines, rows with fewer than 5 columns, and
    /// rows whose RefSeq column is empty or "-" are skipped. Column layout
    /// assumed: 0 = LRG id, 3 = LRG transcript (e.g. "t1"), 4 = RefSeq
    /// transcript — presumably the NCBI/LRG RefSeqGene list; TODO confirm.
    pub fn load_lrg_mapping<P: AsRef<Path>>(&mut self, path: P) -> Result<usize, FerroError> {
        use std::io::BufRead;
        let file = File::open(path.as_ref()).map_err(|e| FerroError::Io {
            msg: format!("Failed to open LRG mapping file: {}", e),
        })?;
        let reader = std::io::BufReader::new(file);
        let mut count = 0;
        for line in reader.lines() {
            let line = line.map_err(|e| FerroError::Io {
                msg: format!("Failed to read LRG mapping line: {}", e),
            })?;
            if line.starts_with('#') || line.trim().is_empty() {
                continue;
            }
            let fields: Vec<&str> = line.split('\t').collect();
            if fields.len() < 5 {
                continue;
            }
            let lrg_id = fields[0];
            let lrg_transcript = fields[3];
            let refseq_transcript = fields[4];
            if refseq_transcript.is_empty() || refseq_transcript == "-" {
                continue;
            }
            // Key is the concatenation, e.g. "LRG_1" + "t1" -> "LRG_1t1".
            let lrg_full = format!("{}{}", lrg_id, lrg_transcript);
            self.lrg_to_refseq
                .insert(lrg_full, refseq_transcript.to_string());
            count += 1;
        }
        Ok(count)
    }

    /// Looks up the RefSeq accession for an LRG transcript id ("LRG_1t1").
    pub fn get_refseq_for_lrg(&self, lrg_accession: &str) -> Option<&str> {
        self.lrg_to_refseq.get(lrg_accession).map(|s| s.as_str())
    }

    /// Resolves an accession to a transcript, trying in order:
    /// exact match, versionless match, then (for "LRG_"-prefixed input)
    /// LRG -> RefSeq translation with exact and versionless lookups.
    pub fn get_transcript(&self, accession: &str) -> Option<&CdotTranscript> {
        if let Some(tx) = self.transcripts.get(accession) {
            return Some(tx);
        }
        if let Some(base) = accession.split('.').next() {
            if let Some(versioned) = self.base_to_versioned.get(base) {
                if let Some(tx) = self.transcripts.get(versioned) {
                    return Some(tx);
                }
            }
        }
        if accession.starts_with("LRG_") {
            if let Some(refseq) = self.lrg_to_refseq.get(accession) {
                if let Some(tx) = self.transcripts.get(refseq) {
                    return Some(tx);
                }
                if let Some(base) = refseq.split('.').next() {
                    if let Some(versioned) = self.base_to_versioned.get(base) {
                        if let Some(tx) = self.transcripts.get(versioned) {
                            return Some(tx);
                        }
                    }
                }
            }
        }
        None
    }

    /// All transcript accessions indexed on `contig` (insertion order).
    pub fn transcripts_on_contig(&self, contig: &str) -> Vec<&str> {
        self.contig_index
            .get(contig)
            .map(|ids| ids.iter().map(|s| s.as_str()).collect())
            .unwrap_or_default()
    }

    /// Transcripts on `contig` whose overall genomic span (min exon start to
    /// max exon end, introns included) contains `pos`.
    pub fn transcripts_at_position(&self, contig: &str, pos: u64) -> Vec<(&str, &CdotTranscript)> {
        self.contig_index
            .get(contig)
            .map(|ids| {
                ids.iter()
                    .filter_map(|id| {
                        let tx = self.transcripts.get(id)?;
                        // Overall span across all exons; a transcript with no
                        // exons yields (u64::MAX, 0) and never matches.
                        let (min, max) = tx.exons.iter().fold((u64::MAX, 0), |(min, max), e| {
                            (min.min(e[0]), max.max(e[1]))
                        });
                        if pos >= min && pos < max {
                            Some((id.as_str(), tx))
                        } else {
                            None
                        }
                    })
                    .collect()
            })
            .unwrap_or_default()
    }

    /// Number of transcripts held by the mapper.
    pub fn transcript_count(&self) -> usize {
        self.transcripts.len()
    }

    /// Iterator over all transcript accessions (arbitrary order).
    pub fn transcript_ids(&self) -> impl Iterator<Item = &str> {
        self.transcripts.keys().map(|s| s.as_str())
    }
}
impl Default for CdotMapper {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plus-strand fixture: three exons totalling 450 tx bases, CDS [50, 400).
    fn sample_transcript() -> CdotTranscript {
        CdotTranscript {
            gene_name: Some("TEST".to_string()),
            contig: "NC_000001.11".to_string(),
            strand: Strand::Plus,
            exons: vec![
                [1000, 1100, 0, 100],
                [2000, 2200, 100, 300],
                [3000, 3150, 300, 450],
            ],
            cds_start: Some(50),
            cds_end: Some(400),
            exon_cigars: Vec::new(),
            gene_id: None,
            protein: None,
        }
    }

    /// Minus-strand fixture: same exons listed in transcript order, which
    /// runs from high to low genomic coordinates.
    fn minus_strand_transcript() -> CdotTranscript {
        CdotTranscript {
            gene_name: Some("MINUS".to_string()),
            contig: "NC_000001.11".to_string(),
            strand: Strand::Minus,
            exons: vec![
                [3000, 3150, 0, 150],
                [2000, 2200, 150, 350],
                [1000, 1100, 350, 450],
            ],
            cds_start: Some(50),
            cds_end: Some(400),
            exon_cigars: Vec::new(),
            gene_id: None,
            protein: None,
        }
    }

    // --- transcript geometry ---

    #[test]
    fn test_transcript_length() {
        let tx = sample_transcript();
        assert_eq!(tx.transcript_length(), 450);
    }

    #[test]
    fn test_cds_length() {
        let tx = sample_transcript();
        assert_eq!(tx.cds_length(), Some(350));
    }

    #[test]
    fn test_exon_for_tx_pos() {
        let tx = sample_transcript();
        let exon = tx.exon_for_tx_pos(50).unwrap();
        assert_eq!(exon.number, 1);
        let exon = tx.exon_for_tx_pos(150).unwrap();
        assert_eq!(exon.number, 2);
        let exon = tx.exon_for_tx_pos(350).unwrap();
        assert_eq!(exon.number, 3);
        assert!(tx.exon_for_tx_pos(500).is_none());
    }

    #[test]
    fn test_exon_for_genome_pos() {
        let tx = sample_transcript();
        let exon = tx.exon_for_genome_pos(1050).unwrap();
        assert_eq!(exon.number, 1);
        let exon = tx.exon_for_genome_pos(2100).unwrap();
        assert_eq!(exon.number, 2);
        // 1500 is intronic.
        assert!(tx.exon_for_genome_pos(1500).is_none());
    }

    // --- coordinate conversion ---

    #[test]
    fn test_tx_to_genome_plus_strand() {
        let tx = sample_transcript();
        assert_eq!(tx.tx_to_genome(0), Some(1000));
        assert_eq!(tx.tx_to_genome(50), Some(1050));
        assert_eq!(tx.tx_to_genome(99), Some(1099));
        assert_eq!(tx.tx_to_genome(100), Some(2000));
        assert_eq!(tx.tx_to_genome(200), Some(2100));
        assert_eq!(tx.tx_to_genome(300), Some(3000));
    }

    #[test]
    fn test_tx_to_genome_minus_strand() {
        let tx = minus_strand_transcript();
        assert_eq!(tx.tx_to_genome(0), Some(3149));
        assert_eq!(tx.tx_to_genome(50), Some(3099));
        assert_eq!(tx.tx_to_genome(149), Some(3000));
        assert_eq!(tx.tx_to_genome(150), Some(2199));
    }

    #[test]
    fn test_genome_to_tx_plus_strand() {
        let tx = sample_transcript();
        assert_eq!(tx.genome_to_tx(1000), Some(0));
        assert_eq!(tx.genome_to_tx(1050), Some(50));
        assert_eq!(tx.genome_to_tx(2000), Some(100));
        assert!(tx.genome_to_tx(1500).is_none());
    }

    #[test]
    fn test_genome_to_tx_minus_strand() {
        let tx = minus_strand_transcript();
        assert_eq!(tx.genome_to_tx(3149), Some(0));
        assert_eq!(tx.genome_to_tx(3000), Some(149));
        assert_eq!(tx.genome_to_tx(2199), Some(150));
    }

    #[test]
    fn test_cds_to_tx() {
        let tx = sample_transcript();
        assert_eq!(tx.cds_to_tx(1), Some(50));
        assert_eq!(tx.cds_to_tx(100), Some(149));
        assert_eq!(tx.cds_to_tx(-1), Some(49));
        assert_eq!(tx.cds_to_tx(-50), Some(0));
        // HGVS has no position 0.
        assert!(tx.cds_to_tx(0).is_none());
    }

    #[test]
    fn test_tx_to_cds() {
        let tx = sample_transcript();
        assert_eq!(tx.tx_to_cds(0), Some(CdsPosition::FivePrimeUtr(50)));
        assert_eq!(tx.tx_to_cds(49), Some(CdsPosition::FivePrimeUtr(1)));
        assert_eq!(tx.tx_to_cds(50), Some(CdsPosition::Cds(1)));
        assert_eq!(tx.tx_to_cds(149), Some(CdsPosition::Cds(100)));
        assert_eq!(tx.tx_to_cds(399), Some(CdsPosition::Cds(350)));
        assert_eq!(tx.tx_to_cds(400), Some(CdsPosition::ThreePrimeUtr(1)));
        assert_eq!(tx.tx_to_cds(449), Some(CdsPosition::ThreePrimeUtr(50)));
    }

    // --- Exon helpers ---

    #[test]
    fn test_exon_len() {
        let exon = Exon::from_array(1, [1000, 1100, 0, 100]);
        assert_eq!(exon.len(), 100);
        assert!(!exon.is_empty());
    }

    #[test]
    fn test_exon_contains() {
        let exon = Exon::from_array(1, [1000, 1100, 0, 100]);
        assert!(exon.contains_genome_pos(1000));
        assert!(exon.contains_genome_pos(1099));
        assert!(!exon.contains_genome_pos(1100));
        assert!(exon.contains_tx_pos(0));
        assert!(exon.contains_tx_pos(99));
        assert!(!exon.contains_tx_pos(100));
    }

    // --- CdotMapper ---

    #[test]
    fn test_mapper_new() {
        let mapper = CdotMapper::new();
        assert_eq!(mapper.transcript_count(), 0);
    }

    #[test]
    fn test_mapper_add_transcript() {
        let mut mapper = CdotMapper::new();
        mapper.add_transcript("NM_000088.3".to_string(), sample_transcript());
        assert_eq!(mapper.transcript_count(), 1);
        assert!(mapper.get_transcript("NM_000088.3").is_some());
        // Version-mismatched lookup falls back to the versionless index.
        assert!(mapper.get_transcript("NM_000088.4").is_some());
        assert!(mapper.get_transcript("NM_999999.1").is_none());
    }

    #[test]
    fn test_mapper_contig_index() {
        let mut mapper = CdotMapper::new();
        mapper.add_transcript("NM_000088.3".to_string(), sample_transcript());
        mapper.add_transcript("NM_000088.4".to_string(), sample_transcript());
        let tx_ids = mapper.transcripts_on_contig("NC_000001.11");
        assert_eq!(tx_ids.len(), 2);
        assert!(tx_ids.contains(&"NM_000088.3"));
        assert!(tx_ids.contains(&"NM_000088.4"));
        let tx_ids = mapper.transcripts_on_contig("NC_000002.12");
        assert!(tx_ids.is_empty());
    }

    #[test]
    fn test_mapper_transcripts_at_position() {
        let mut mapper = CdotMapper::new();
        mapper.add_transcript("NM_000088.3".to_string(), sample_transcript());
        // 2050 is intronic but within the overall genomic span.
        let results = mapper.transcripts_at_position("NC_000001.11", 2050);
        assert_eq!(results.len(), 1);
        let results = mapper.transcripts_at_position("NC_000001.11", 5000);
        assert!(results.is_empty());
    }

    #[test]
    fn test_mapper_from_json() {
        // Flat-layout cdot JSON (no genome_builds key).
        let json = r#"
        {
            "transcripts": {
                "NM_000088.3": {
                    "gene_name": "COL1A1",
                    "contig": "NC_000017.11",
                    "strand": "+",
                    "exons": [
                        [50184096, 50184169, 0, 73],
                        [50185022, 50185148, 73, 199]
                    ],
                    "cds_start": 149,
                    "cds_end": 4544
                }
            }
        }
        "#;
        let mapper = CdotMapper::from_reader(json.as_bytes()).unwrap();
        assert_eq!(mapper.transcript_count(), 1);
        let tx = mapper.get_transcript("NM_000088.3").unwrap();
        assert_eq!(tx.gene_name.as_deref(), Some("COL1A1"));
        assert_eq!(tx.strand, Strand::Plus);
        assert_eq!(tx.exons.len(), 2);
    }

    // --- CIGAR parsing ---

    #[test]
    fn test_parse_cigar_simple_match() {
        let ops = parse_cigar("M185").unwrap();
        assert_eq!(ops, vec![CigarOp::Match(185)]);
    }

    #[test]
    fn test_parse_cigar_with_insertion() {
        let ops = parse_cigar("M185 I3 M250").unwrap();
        assert_eq!(
            ops,
            vec![
                CigarOp::Match(185),
                CigarOp::Insertion(3),
                CigarOp::Match(250),
            ]
        );
    }

    #[test]
    fn test_parse_cigar_with_deletion() {
        let ops = parse_cigar("M504 D2 M123").unwrap();
        assert_eq!(
            ops,
            vec![
                CigarOp::Match(504),
                CigarOp::Deletion(2),
                CigarOp::Match(123),
            ]
        );
    }

    #[test]
    fn test_parse_cigar_complex() {
        let ops = parse_cigar("M6 D1 M4 I2 M3").unwrap();
        assert_eq!(
            ops,
            vec![
                CigarOp::Match(6),
                CigarOp::Deletion(1),
                CigarOp::Match(4),
                CigarOp::Insertion(2),
                CigarOp::Match(3),
            ]
        );
    }

    #[test]
    fn test_parse_cigar_empty() {
        let ops = parse_cigar("").unwrap();
        assert!(ops.is_empty());
    }

    #[test]
    fn test_parse_cigar_whitespace_only() {
        let ops = parse_cigar("   ").unwrap();
        assert!(ops.is_empty());
    }

    #[test]
    fn test_parse_cigar_invalid_operation() {
        let result = parse_cigar("X5");
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_cigar_invalid_length() {
        let result = parse_cigar("Mabc");
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_cigar_too_short_token() {
        let result = parse_cigar("M");
        assert!(result.is_err());
    }

    // --- cumulative insertion offsets ---

    #[test]
    fn test_cumulative_insertion_offset_before_insertion() {
        let cigar = vec![
            CigarOp::Match(185),
            CigarOp::Insertion(3),
            CigarOp::Match(250),
        ];
        assert_eq!(cumulative_insertion_offset(&cigar, 100), 0);
    }

    #[test]
    fn test_cumulative_insertion_offset_after_insertion() {
        let cigar = vec![
            CigarOp::Match(185),
            CigarOp::Insertion(3),
            CigarOp::Match(250),
        ];
        assert_eq!(cumulative_insertion_offset(&cigar, 200), 3);
    }

    #[test]
    fn test_cumulative_insertion_offset_at_end() {
        let cigar = vec![
            CigarOp::Match(185),
            CigarOp::Insertion(3),
            CigarOp::Match(250),
        ];
        assert_eq!(cumulative_insertion_offset(&cigar, 437), 3);
    }

    #[test]
    fn test_cumulative_insertion_offset_no_insertions() {
        let cigar = vec![CigarOp::Match(500)];
        assert_eq!(cumulative_insertion_offset(&cigar, 250), 0);
    }

    #[test]
    fn test_cumulative_insertion_offset_multiple_insertions() {
        let cigar = vec![
            CigarOp::Match(100),
            CigarOp::Insertion(2),
            CigarOp::Match(100),
            CigarOp::Insertion(5),
            CigarOp::Match(100),
        ];
        assert_eq!(cumulative_insertion_offset(&cigar, 50), 0);
        assert_eq!(cumulative_insertion_offset(&cigar, 150), 2);
        assert_eq!(cumulative_insertion_offset(&cigar, 300), 7);
    }

    #[test]
    fn test_cumulative_insertion_offset_with_deletion() {
        let cigar = vec![
            CigarOp::Match(100),
            CigarOp::Deletion(5),
            CigarOp::Match(100),
            CigarOp::Insertion(3),
            CigarOp::Match(100),
        ];
        assert_eq!(cumulative_insertion_offset(&cigar, 150), 0);
        assert_eq!(cumulative_insertion_offset(&cigar, 250), 3);
    }
}