1use std::collections::BTreeMap;
12
13use cyanea_core::{CyaneaError, Result};
14
15use crate::genomic::{GenomicInterval, Strand};
16
17#[derive(Debug, Clone)]
19#[allow(dead_code)]
20struct AlignmentBlock {
21 source_start: u64,
22 source_end: u64,
23 target_start: u64,
24 target_end: u64,
25}
26
27#[derive(Debug, Clone)]
29#[allow(dead_code)]
30struct Chain {
31 score: u64,
32 source_chrom: String,
33 source_size: u64,
34 source_strand: Strand,
35 source_start: u64,
36 source_end: u64,
37 target_chrom: String,
38 target_size: u64,
39 target_strand: Strand,
40 target_start: u64,
41 target_end: u64,
42 blocks: Vec<AlignmentBlock>,
43}
44
45#[derive(Debug, Clone)]
47pub struct ChainFile {
48 chains: BTreeMap<String, Vec<Chain>>,
49}
50
51#[derive(Debug, Clone, PartialEq)]
53pub enum LiftoverResult {
54 Mapped(GenomicInterval),
56 Partial {
58 mapped: GenomicInterval,
59 fraction_mapped: f64,
60 },
61 Unmapped,
63}
64
65fn parse_strand(s: &str) -> Result<Strand> {
66 match s {
67 "+" => Ok(Strand::Forward),
68 "-" => Ok(Strand::Reverse),
69 _ => Err(CyaneaError::Parse(format!(
70 "invalid strand '{s}'"
71 ))),
72 }
73}
74
75pub fn parse_chain(input: &str) -> Result<ChainFile> {
90 let mut chains: BTreeMap<String, Vec<Chain>> = BTreeMap::new();
91 let mut current_chain: Option<Chain> = None;
92 let mut source_offset = 0u64;
93 let mut target_offset = 0u64;
94
95 for line in input.lines() {
96 let line = line.trim();
97 if line.is_empty() {
98 continue;
99 }
100
101 if line.starts_with("chain ") {
102 if let Some(chain) = current_chain.take() {
104 chains
105 .entry(chain.source_chrom.clone())
106 .or_default()
107 .push(chain);
108 }
109
110 let fields: Vec<&str> = line.split_whitespace().collect();
111 if fields.len() < 12 {
112 return Err(CyaneaError::Parse(format!(
113 "chain header requires at least 12 fields, got {}: {line}",
114 fields.len()
115 )));
116 }
117
118 let score: u64 = fields[1]
119 .parse()
120 .map_err(|_| CyaneaError::Parse(format!("invalid score: {}", fields[1])))?;
121
122 let source_chrom = fields[2].to_string();
124 let source_size: u64 = fields[3].parse().map_err(|_| {
125 CyaneaError::Parse(format!("invalid source size: {}", fields[3]))
126 })?;
127 let source_strand = parse_strand(fields[4])?;
128 let source_start: u64 = fields[5].parse().map_err(|_| {
129 CyaneaError::Parse(format!("invalid source start: {}", fields[5]))
130 })?;
131 let source_end: u64 = fields[6].parse().map_err(|_| {
132 CyaneaError::Parse(format!("invalid source end: {}", fields[6]))
133 })?;
134
135 let target_chrom = fields[7].to_string();
137 let target_size: u64 = fields[8].parse().map_err(|_| {
138 CyaneaError::Parse(format!("invalid target size: {}", fields[8]))
139 })?;
140 let target_strand = parse_strand(fields[9])?;
141 let target_start: u64 = fields[10].parse().map_err(|_| {
142 CyaneaError::Parse(format!("invalid target start: {}", fields[10]))
143 })?;
144 let target_end: u64 = fields[11].parse().map_err(|_| {
145 CyaneaError::Parse(format!("invalid target end: {}", fields[11]))
146 })?;
147
148 source_offset = source_start;
149 target_offset = target_start;
150
151 current_chain = Some(Chain {
152 score,
153 source_chrom,
154 source_size,
155 source_strand,
156 source_start,
157 source_end,
158 target_chrom,
159 target_size,
160 target_strand,
161 target_start,
162 target_end,
163 blocks: Vec::new(),
164 });
165
166 continue;
167 }
168
169 if let Some(ref mut chain) = current_chain {
171 let fields: Vec<&str> = line.split_whitespace().collect();
172 if fields.is_empty() {
173 continue;
174 }
175
176 let size: u64 = fields[0].parse().map_err(|_| {
177 CyaneaError::Parse(format!("invalid block size: {}", fields[0]))
178 })?;
179
180 chain.blocks.push(AlignmentBlock {
181 source_start: source_offset,
182 source_end: source_offset + size,
183 target_start: target_offset,
184 target_end: target_offset + size,
185 });
186
187 if fields.len() >= 3 {
188 let dt: u64 = fields[1].parse().map_err(|_| {
189 CyaneaError::Parse(format!("invalid dt: {}", fields[1]))
190 })?;
191 let dq: u64 = fields[2].parse().map_err(|_| {
192 CyaneaError::Parse(format!("invalid dq: {}", fields[2]))
193 })?;
194 source_offset += size + dt;
195 target_offset += size + dq;
196 }
197 }
199 }
200
201 if let Some(chain) = current_chain.take() {
203 chains
204 .entry(chain.source_chrom.clone())
205 .or_default()
206 .push(chain);
207 }
208
209 for chains_vec in chains.values_mut() {
211 chains_vec.sort_by(|a, b| b.score.cmp(&a.score));
212 }
213
214 Ok(ChainFile { chains })
215}
216
217fn reverse_to_forward(pos: u64, chrom_size: u64) -> u64 {
219 chrom_size - pos
220}
221
222pub fn liftover(
227 chain_file: &ChainFile,
228 interval: &GenomicInterval,
229 min_match: f64,
230) -> LiftoverResult {
231 let chains = match chain_file.chains.get(&interval.chrom) {
232 Some(c) => c,
233 None => return LiftoverResult::Unmapped,
234 };
235
236 let chain = match chains.iter().find(|c| {
238 c.source_start <= interval.start && c.source_end >= interval.end
239 }) {
240 Some(c) => c,
241 None => {
242 match chains.iter().find(|c| {
244 c.source_start < interval.end && c.source_end > interval.start
245 }) {
246 Some(c) => c,
247 None => return LiftoverResult::Unmapped,
248 }
249 }
250 };
251
252 let mut mapped_bp = 0u64;
254 let mut target_min = u64::MAX;
255 let mut target_max = 0u64;
256 let interval_len = interval.len();
257
258 for block in &chain.blocks {
259 if block.source_end <= interval.start || block.source_start >= interval.end {
260 continue;
261 }
262
263 let overlap_start = block.source_start.max(interval.start);
265 let overlap_end = block.source_end.min(interval.end);
266 let overlap_len = overlap_end - overlap_start;
267 mapped_bp += overlap_len;
268
269 let offset_in_block = overlap_start - block.source_start;
271 let t_start = block.target_start + offset_in_block;
272 let t_end = t_start + overlap_len;
273
274 target_min = target_min.min(t_start);
275 target_max = target_max.max(t_end);
276 }
277
278 if mapped_bp == 0 {
279 return LiftoverResult::Unmapped;
280 }
281
282 let fraction = mapped_bp as f64 / interval_len as f64;
283
284 let (final_start, final_end) = if chain.target_strand == Strand::Reverse {
286 let fwd_start = reverse_to_forward(target_max, chain.target_size);
287 let fwd_end = reverse_to_forward(target_min, chain.target_size);
288 (fwd_start, fwd_end)
289 } else {
290 (target_min, target_max)
291 };
292
293 let mapped = GenomicInterval {
295 chrom: chain.target_chrom.clone(),
296 start: final_start,
297 end: final_end,
298 strand: interval.strand,
299 };
300
301 if fraction >= min_match {
302 if (fraction - 1.0).abs() < 1e-10 {
303 LiftoverResult::Mapped(mapped)
304 } else {
305 LiftoverResult::Partial {
306 mapped,
307 fraction_mapped: fraction,
308 }
309 }
310 } else {
311 LiftoverResult::Unmapped
312 }
313}
314
315pub fn liftover_batch(
317 chain_file: &ChainFile,
318 intervals: &[GenomicInterval],
319 min_match: f64,
320) -> Vec<LiftoverResult> {
321 intervals
322 .iter()
323 .map(|iv| liftover(chain_file, iv, min_match))
324 .collect()
325}
326
327#[cfg(test)]
328mod tests {
329 use super::*;
330
331 fn iv(chrom: &str, start: u64, end: u64) -> GenomicInterval {
332 GenomicInterval::new(chrom, start, end).unwrap()
333 }
334
335 const SIMPLE_CHAIN: &str = "\
341chain 1000 chr1 1000 + 100 600 chrA 800 + 50 500 1
342200 100 50
343200
344";
345
346 #[test]
347 fn test_parse_basic_chain() {
348 let cf = parse_chain(SIMPLE_CHAIN).unwrap();
349 assert!(cf.chains.contains_key("chr1"));
350 let chains = &cf.chains["chr1"];
351 assert_eq!(chains.len(), 1);
352 assert_eq!(chains[0].score, 1000);
353 assert_eq!(chains[0].source_chrom, "chr1");
354 assert_eq!(chains[0].target_chrom, "chrA");
355 assert_eq!(chains[0].blocks.len(), 2);
356
357 assert_eq!(chains[0].blocks[0].source_start, 100);
359 assert_eq!(chains[0].blocks[0].source_end, 300);
360 assert_eq!(chains[0].blocks[0].target_start, 50);
361 assert_eq!(chains[0].blocks[0].target_end, 250);
362
363 assert_eq!(chains[0].blocks[1].source_start, 400);
365 assert_eq!(chains[0].blocks[1].source_end, 600);
366 assert_eq!(chains[0].blocks[1].target_start, 300);
367 assert_eq!(chains[0].blocks[1].target_end, 500);
368 }
369
370 #[test]
371 fn test_parse_multiple_chains() {
372 let input = "\
373chain 5000 chr1 1000 + 0 500 chrA 800 + 0 400 1
374200 100 50
375200
376
377chain 3000 chr2 2000 + 100 400 chrB 1500 + 200 500 2
378300
379";
380 let cf = parse_chain(input).unwrap();
381 assert!(cf.chains.contains_key("chr1"));
382 assert!(cf.chains.contains_key("chr2"));
383 assert_eq!(cf.chains["chr2"][0].blocks.len(), 1);
384 }
385
386 #[test]
387 fn test_parse_reverse_strand() {
388 let input = "\
389chain 2000 chr1 1000 + 100 400 chrA 800 - 300 600 1
390300
391";
392 let cf = parse_chain(input).unwrap();
393 let chain = &cf.chains["chr1"][0];
394 assert_eq!(chain.source_strand, Strand::Forward);
395 assert_eq!(chain.target_strand, Strand::Reverse);
396 }
397
398 #[test]
399 fn test_parse_invalid_format() {
400 let input = "chain 1000 chr1";
401 assert!(parse_chain(input).is_err());
402 }
403
404 #[test]
405 fn test_liftover_within_single_block() {
406 let cf = parse_chain(SIMPLE_CHAIN).unwrap();
407 let result = liftover(&cf, &iv("chr1", 150, 250), 0.95);
410 match result {
411 LiftoverResult::Mapped(mapped) => {
412 assert_eq!(mapped.chrom, "chrA");
413 assert_eq!(mapped.start, 100);
414 assert_eq!(mapped.end, 200);
415 }
416 other => panic!("expected Mapped, got {:?}", other),
417 }
418 }
419
420 #[test]
421 fn test_liftover_spanning_blocks() {
422 let cf = parse_chain(SIMPLE_CHAIN).unwrap();
423 let result = liftover(&cf, &iv("chr1", 200, 500), 0.5);
429 match result {
430 LiftoverResult::Partial {
431 mapped,
432 fraction_mapped,
433 } => {
434 assert_eq!(mapped.chrom, "chrA");
435 assert_eq!(mapped.start, 150);
436 assert_eq!(mapped.end, 400);
437 assert!((fraction_mapped - 200.0 / 300.0).abs() < 1e-10);
438 }
439 other => panic!("expected Partial, got {:?}", other),
440 }
441 }
442
443 #[test]
444 fn test_liftover_in_gap() {
445 let cf = parse_chain(SIMPLE_CHAIN).unwrap();
446 let result = liftover(&cf, &iv("chr1", 310, 390), 0.95);
448 assert_eq!(result, LiftoverResult::Unmapped);
449 }
450
451 #[test]
452 fn test_liftover_reverse_strand() {
453 let input = "\
454chain 2000 chr1 1000 + 100 400 chrA 800 - 300 600 1
455300
456";
457 let cf = parse_chain(input).unwrap();
458 let result = liftover(&cf, &iv("chr1", 150, 250), 0.95);
463 match result {
464 LiftoverResult::Mapped(mapped) => {
465 assert_eq!(mapped.chrom, "chrA");
466 assert_eq!(mapped.start, 350);
467 assert_eq!(mapped.end, 450);
468 }
469 other => panic!("expected Mapped, got {:?}", other),
470 }
471 }
472
473 #[test]
474 fn test_liftover_no_chain_for_chrom() {
475 let cf = parse_chain(SIMPLE_CHAIN).unwrap();
476 let result = liftover(&cf, &iv("chrX", 100, 200), 0.95);
477 assert_eq!(result, LiftoverResult::Unmapped);
478 }
479
480 #[test]
481 fn test_liftover_min_match_threshold() {
482 let cf = parse_chain(SIMPLE_CHAIN).unwrap();
483 let strict = liftover(&cf, &iv("chr1", 200, 500), 0.95);
485 assert_eq!(strict, LiftoverResult::Unmapped);
486
487 let lenient = liftover(&cf, &iv("chr1", 200, 500), 0.5);
488 assert!(matches!(lenient, LiftoverResult::Partial { .. }));
489 }
490
491 #[test]
492 fn test_liftover_batch_consistency() {
493 let cf = parse_chain(SIMPLE_CHAIN).unwrap();
494 let intervals = vec![iv("chr1", 150, 250), iv("chr1", 310, 390), iv("chrX", 0, 100)];
495 let results = liftover_batch(&cf, &intervals, 0.95);
496 assert_eq!(results.len(), 3);
497 assert!(matches!(results[0], LiftoverResult::Mapped(_)));
498 assert_eq!(results[1], LiftoverResult::Unmapped);
499 assert_eq!(results[2], LiftoverResult::Unmapped);
500 }
501
502 #[test]
503 fn test_parse_empty_input() {
504 let cf = parse_chain("").unwrap();
505 assert!(cf.chains.is_empty());
506 }
507
508 #[test]
509 fn test_parse_chain_score_ordering() {
510 let input = "\
511chain 1000 chr1 2000 + 0 500 chrA 1000 + 0 400 1
512200 100 50
513200
514
515chain 5000 chr1 2000 + 0 800 chrB 1500 + 0 700 2
516400 100 50
517300
518";
519 let cf = parse_chain(input).unwrap();
520 let chains = &cf.chains["chr1"];
521 assert_eq!(chains.len(), 2);
522 assert_eq!(chains[0].score, 5000);
524 assert_eq!(chains[1].score, 1000);
525 }
526}