Skip to main content

cyanea_omics/
liftover.rs

1//! UCSC chain file parsing and genomic coordinate liftover.
2//!
3//! Remaps coordinates between genome assemblies using UCSC chain files.
4//!
5//! # Chain file format
6//!
7//! A chain file contains pairwise alignments between a source (target) and
8//! query genome. Each alignment is described by a header line followed by
9//! data lines specifying ungapped alignment blocks and the gaps between them.
10
11use std::collections::BTreeMap;
12
13use cyanea_core::{CyaneaError, Result};
14
15use crate::genomic::{GenomicInterval, Strand};
16
17/// A single ungapped alignment block within a chain.
18#[derive(Debug, Clone)]
19#[allow(dead_code)]
20struct AlignmentBlock {
21    source_start: u64,
22    source_end: u64,
23    target_start: u64,
24    target_end: u64,
25}
26
27/// A parsed chain alignment.
28#[derive(Debug, Clone)]
29#[allow(dead_code)]
30struct Chain {
31    score: u64,
32    source_chrom: String,
33    source_size: u64,
34    source_strand: Strand,
35    source_start: u64,
36    source_end: u64,
37    target_chrom: String,
38    target_size: u64,
39    target_strand: Strand,
40    target_start: u64,
41    target_end: u64,
42    blocks: Vec<AlignmentBlock>,
43}
44
45/// A parsed chain file, indexed by source chromosome.
46#[derive(Debug, Clone)]
47pub struct ChainFile {
48    chains: BTreeMap<String, Vec<Chain>>,
49}
50
51/// Result of a liftover operation for a single interval.
52#[derive(Debug, Clone, PartialEq)]
53pub enum LiftoverResult {
54    /// The interval was fully mapped to the target genome.
55    Mapped(GenomicInterval),
56    /// The interval was partially mapped.
57    Partial {
58        mapped: GenomicInterval,
59        fraction_mapped: f64,
60    },
61    /// The interval could not be mapped.
62    Unmapped,
63}
64
65fn parse_strand(s: &str) -> Result<Strand> {
66    match s {
67        "+" => Ok(Strand::Forward),
68        "-" => Ok(Strand::Reverse),
69        _ => Err(CyaneaError::Parse(format!(
70            "invalid strand '{s}'"
71        ))),
72    }
73}
74
75/// Parse a UCSC chain file from a string.
76///
77/// # Format
78///
79/// ```text
80/// chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id
81/// size dt dq
82/// size dt dq
83/// size
84/// ```
85///
86/// The header uses "target" for the reference (source) genome and "query"
87/// for the destination, matching UCSC convention. In our API, "source"
88/// corresponds to the chain target (the genome you're lifting FROM).
89pub fn parse_chain(input: &str) -> Result<ChainFile> {
90    let mut chains: BTreeMap<String, Vec<Chain>> = BTreeMap::new();
91    let mut current_chain: Option<Chain> = None;
92    let mut source_offset = 0u64;
93    let mut target_offset = 0u64;
94
95    for line in input.lines() {
96        let line = line.trim();
97        if line.is_empty() {
98            continue;
99        }
100
101        if line.starts_with("chain ") {
102            // Finalize previous chain
103            if let Some(chain) = current_chain.take() {
104                chains
105                    .entry(chain.source_chrom.clone())
106                    .or_default()
107                    .push(chain);
108            }
109
110            let fields: Vec<&str> = line.split_whitespace().collect();
111            if fields.len() < 12 {
112                return Err(CyaneaError::Parse(format!(
113                    "chain header requires at least 12 fields, got {}: {line}",
114                    fields.len()
115                )));
116            }
117
118            let score: u64 = fields[1]
119                .parse()
120                .map_err(|_| CyaneaError::Parse(format!("invalid score: {}", fields[1])))?;
121
122            // Target = source in our naming (genome we're lifting FROM)
123            let source_chrom = fields[2].to_string();
124            let source_size: u64 = fields[3].parse().map_err(|_| {
125                CyaneaError::Parse(format!("invalid source size: {}", fields[3]))
126            })?;
127            let source_strand = parse_strand(fields[4])?;
128            let source_start: u64 = fields[5].parse().map_err(|_| {
129                CyaneaError::Parse(format!("invalid source start: {}", fields[5]))
130            })?;
131            let source_end: u64 = fields[6].parse().map_err(|_| {
132                CyaneaError::Parse(format!("invalid source end: {}", fields[6]))
133            })?;
134
135            // Query = target in our naming (genome we're lifting TO)
136            let target_chrom = fields[7].to_string();
137            let target_size: u64 = fields[8].parse().map_err(|_| {
138                CyaneaError::Parse(format!("invalid target size: {}", fields[8]))
139            })?;
140            let target_strand = parse_strand(fields[9])?;
141            let target_start: u64 = fields[10].parse().map_err(|_| {
142                CyaneaError::Parse(format!("invalid target start: {}", fields[10]))
143            })?;
144            let target_end: u64 = fields[11].parse().map_err(|_| {
145                CyaneaError::Parse(format!("invalid target end: {}", fields[11]))
146            })?;
147
148            source_offset = source_start;
149            target_offset = target_start;
150
151            current_chain = Some(Chain {
152                score,
153                source_chrom,
154                source_size,
155                source_strand,
156                source_start,
157                source_end,
158                target_chrom,
159                target_size,
160                target_strand,
161                target_start,
162                target_end,
163                blocks: Vec::new(),
164            });
165
166            continue;
167        }
168
169        // Data line: size [dt dq]
170        if let Some(ref mut chain) = current_chain {
171            let fields: Vec<&str> = line.split_whitespace().collect();
172            if fields.is_empty() {
173                continue;
174            }
175
176            let size: u64 = fields[0].parse().map_err(|_| {
177                CyaneaError::Parse(format!("invalid block size: {}", fields[0]))
178            })?;
179
180            chain.blocks.push(AlignmentBlock {
181                source_start: source_offset,
182                source_end: source_offset + size,
183                target_start: target_offset,
184                target_end: target_offset + size,
185            });
186
187            if fields.len() >= 3 {
188                let dt: u64 = fields[1].parse().map_err(|_| {
189                    CyaneaError::Parse(format!("invalid dt: {}", fields[1]))
190                })?;
191                let dq: u64 = fields[2].parse().map_err(|_| {
192                    CyaneaError::Parse(format!("invalid dq: {}", fields[2]))
193                })?;
194                source_offset += size + dt;
195                target_offset += size + dq;
196            }
197            // Last line of chain has only `size` — no gap after
198        }
199    }
200
201    // Finalize last chain
202    if let Some(chain) = current_chain.take() {
203        chains
204            .entry(chain.source_chrom.clone())
205            .or_default()
206            .push(chain);
207    }
208
209    // Sort chains by score (descending) for each chromosome
210    for chains_vec in chains.values_mut() {
211        chains_vec.sort_by(|a, b| b.score.cmp(&a.score));
212    }
213
214    Ok(ChainFile { chains })
215}
216
217/// Convert a reverse-strand coordinate to forward-strand.
218fn reverse_to_forward(pos: u64, chrom_size: u64) -> u64 {
219    chrom_size - pos
220}
221
222/// Liftover a single interval using the chain file.
223///
224/// `min_match` is the minimum fraction of the interval that must map (0.0–1.0).
225/// Use 0.95 as a reasonable default.
226pub fn liftover(
227    chain_file: &ChainFile,
228    interval: &GenomicInterval,
229    min_match: f64,
230) -> LiftoverResult {
231    let chains = match chain_file.chains.get(&interval.chrom) {
232        Some(c) => c,
233        None => return LiftoverResult::Unmapped,
234    };
235
236    // Find the best chain that overlaps this interval (chains are sorted by score desc)
237    let chain = match chains.iter().find(|c| {
238        c.source_start <= interval.start && c.source_end >= interval.end
239    }) {
240        Some(c) => c,
241        None => {
242            // Try partial overlap with best-scoring chain
243            match chains.iter().find(|c| {
244                c.source_start < interval.end && c.source_end > interval.start
245            }) {
246                Some(c) => c,
247                None => return LiftoverResult::Unmapped,
248            }
249        }
250    };
251
252    // Find alignment blocks that overlap the query interval
253    let mut mapped_bp = 0u64;
254    let mut target_min = u64::MAX;
255    let mut target_max = 0u64;
256    let interval_len = interval.len();
257
258    for block in &chain.blocks {
259        if block.source_end <= interval.start || block.source_start >= interval.end {
260            continue;
261        }
262
263        // Compute overlap
264        let overlap_start = block.source_start.max(interval.start);
265        let overlap_end = block.source_end.min(interval.end);
266        let overlap_len = overlap_end - overlap_start;
267        mapped_bp += overlap_len;
268
269        // Map the overlap to target coordinates
270        let offset_in_block = overlap_start - block.source_start;
271        let t_start = block.target_start + offset_in_block;
272        let t_end = t_start + overlap_len;
273
274        target_min = target_min.min(t_start);
275        target_max = target_max.max(t_end);
276    }
277
278    if mapped_bp == 0 {
279        return LiftoverResult::Unmapped;
280    }
281
282    let fraction = mapped_bp as f64 / interval_len as f64;
283
284    // Handle reverse strand chains: convert target coordinates
285    let (final_start, final_end) = if chain.target_strand == Strand::Reverse {
286        let fwd_start = reverse_to_forward(target_max, chain.target_size);
287        let fwd_end = reverse_to_forward(target_min, chain.target_size);
288        (fwd_start, fwd_end)
289    } else {
290        (target_min, target_max)
291    };
292
293    // Construct the mapped interval (use the chain's target chrom)
294    let mapped = GenomicInterval {
295        chrom: chain.target_chrom.clone(),
296        start: final_start,
297        end: final_end,
298        strand: interval.strand,
299    };
300
301    if fraction >= min_match {
302        if (fraction - 1.0).abs() < 1e-10 {
303            LiftoverResult::Mapped(mapped)
304        } else {
305            LiftoverResult::Partial {
306                mapped,
307                fraction_mapped: fraction,
308            }
309        }
310    } else {
311        LiftoverResult::Unmapped
312    }
313}
314
315/// Batch liftover of multiple intervals.
316pub fn liftover_batch(
317    chain_file: &ChainFile,
318    intervals: &[GenomicInterval],
319    min_match: f64,
320) -> Vec<LiftoverResult> {
321    intervals
322        .iter()
323        .map(|iv| liftover(chain_file, iv, min_match))
324        .collect()
325}
326
327#[cfg(test)]
328mod tests {
329    use super::*;
330
331    fn iv(chrom: &str, start: u64, end: u64) -> GenomicInterval {
332        GenomicInterval::new(chrom, start, end).unwrap()
333    }
334
335    // A simple chain: chr1 (1000bp) -> chrA (800bp), forward strand
336    // One chain with two alignment blocks:
337    //   Block 1: source [100, 300) -> target [50, 250)
338    //   Gap: dt=100, dq=50
339    //   Block 2: source [400, 600) -> target [300, 500)
340    const SIMPLE_CHAIN: &str = "\
341chain 1000 chr1 1000 + 100 600 chrA 800 + 50 500 1
342200 100 50
343200
344";
345
346    #[test]
347    fn test_parse_basic_chain() {
348        let cf = parse_chain(SIMPLE_CHAIN).unwrap();
349        assert!(cf.chains.contains_key("chr1"));
350        let chains = &cf.chains["chr1"];
351        assert_eq!(chains.len(), 1);
352        assert_eq!(chains[0].score, 1000);
353        assert_eq!(chains[0].source_chrom, "chr1");
354        assert_eq!(chains[0].target_chrom, "chrA");
355        assert_eq!(chains[0].blocks.len(), 2);
356
357        // Block 1
358        assert_eq!(chains[0].blocks[0].source_start, 100);
359        assert_eq!(chains[0].blocks[0].source_end, 300);
360        assert_eq!(chains[0].blocks[0].target_start, 50);
361        assert_eq!(chains[0].blocks[0].target_end, 250);
362
363        // Block 2
364        assert_eq!(chains[0].blocks[1].source_start, 400);
365        assert_eq!(chains[0].blocks[1].source_end, 600);
366        assert_eq!(chains[0].blocks[1].target_start, 300);
367        assert_eq!(chains[0].blocks[1].target_end, 500);
368    }
369
370    #[test]
371    fn test_parse_multiple_chains() {
372        let input = "\
373chain 5000 chr1 1000 + 0 500 chrA 800 + 0 400 1
374200 100 50
375200
376
377chain 3000 chr2 2000 + 100 400 chrB 1500 + 200 500 2
378300
379";
380        let cf = parse_chain(input).unwrap();
381        assert!(cf.chains.contains_key("chr1"));
382        assert!(cf.chains.contains_key("chr2"));
383        assert_eq!(cf.chains["chr2"][0].blocks.len(), 1);
384    }
385
386    #[test]
387    fn test_parse_reverse_strand() {
388        let input = "\
389chain 2000 chr1 1000 + 100 400 chrA 800 - 300 600 1
390300
391";
392        let cf = parse_chain(input).unwrap();
393        let chain = &cf.chains["chr1"][0];
394        assert_eq!(chain.source_strand, Strand::Forward);
395        assert_eq!(chain.target_strand, Strand::Reverse);
396    }
397
398    #[test]
399    fn test_parse_invalid_format() {
400        let input = "chain 1000 chr1";
401        assert!(parse_chain(input).is_err());
402    }
403
404    #[test]
405    fn test_liftover_within_single_block() {
406        let cf = parse_chain(SIMPLE_CHAIN).unwrap();
407        // Query [150, 250) is within block 1: source [100,300) -> target [50,250)
408        // offset = 150-100 = 50, so target [100, 200)
409        let result = liftover(&cf, &iv("chr1", 150, 250), 0.95);
410        match result {
411            LiftoverResult::Mapped(mapped) => {
412                assert_eq!(mapped.chrom, "chrA");
413                assert_eq!(mapped.start, 100);
414                assert_eq!(mapped.end, 200);
415            }
416            other => panic!("expected Mapped, got {:?}", other),
417        }
418    }
419
420    #[test]
421    fn test_liftover_spanning_blocks() {
422        let cf = parse_chain(SIMPLE_CHAIN).unwrap();
423        // Query [200, 500) spans block 1 [100,300) and block 2 [400,600)
424        // Block 1 overlap: [200,300) = 100bp, mapped to target [150,250)
425        // Gap: source [300,400) = unmapped
426        // Block 2 overlap: [400,500) = 100bp, mapped to target [300,400)
427        // Total mapped = 200bp out of 300bp = 0.667
428        let result = liftover(&cf, &iv("chr1", 200, 500), 0.5);
429        match result {
430            LiftoverResult::Partial {
431                mapped,
432                fraction_mapped,
433            } => {
434                assert_eq!(mapped.chrom, "chrA");
435                assert_eq!(mapped.start, 150);
436                assert_eq!(mapped.end, 400);
437                assert!((fraction_mapped - 200.0 / 300.0).abs() < 1e-10);
438            }
439            other => panic!("expected Partial, got {:?}", other),
440        }
441    }
442
443    #[test]
444    fn test_liftover_in_gap() {
445        let cf = parse_chain(SIMPLE_CHAIN).unwrap();
446        // Query [310, 390) is in the gap between blocks
447        let result = liftover(&cf, &iv("chr1", 310, 390), 0.95);
448        assert_eq!(result, LiftoverResult::Unmapped);
449    }
450
451    #[test]
452    fn test_liftover_reverse_strand() {
453        let input = "\
454chain 2000 chr1 1000 + 100 400 chrA 800 - 300 600 1
455300
456";
457        let cf = parse_chain(input).unwrap();
458        // Source [100,400) -> target reverse strand [300,600) on chrA (size=800)
459        // Block: source [100,400) -> target [300,600) in reverse coords
460        // Query [150,250) -> offset=50, target_start=350, target_end=450
461        // Reverse conversion: fwd_start = 800-450=350, fwd_end = 800-350=450
462        let result = liftover(&cf, &iv("chr1", 150, 250), 0.95);
463        match result {
464            LiftoverResult::Mapped(mapped) => {
465                assert_eq!(mapped.chrom, "chrA");
466                assert_eq!(mapped.start, 350);
467                assert_eq!(mapped.end, 450);
468            }
469            other => panic!("expected Mapped, got {:?}", other),
470        }
471    }
472
473    #[test]
474    fn test_liftover_no_chain_for_chrom() {
475        let cf = parse_chain(SIMPLE_CHAIN).unwrap();
476        let result = liftover(&cf, &iv("chrX", 100, 200), 0.95);
477        assert_eq!(result, LiftoverResult::Unmapped);
478    }
479
480    #[test]
481    fn test_liftover_min_match_threshold() {
482        let cf = parse_chain(SIMPLE_CHAIN).unwrap();
483        // [200, 500) maps 200/300 = 0.667
484        let strict = liftover(&cf, &iv("chr1", 200, 500), 0.95);
485        assert_eq!(strict, LiftoverResult::Unmapped);
486
487        let lenient = liftover(&cf, &iv("chr1", 200, 500), 0.5);
488        assert!(matches!(lenient, LiftoverResult::Partial { .. }));
489    }
490
491    #[test]
492    fn test_liftover_batch_consistency() {
493        let cf = parse_chain(SIMPLE_CHAIN).unwrap();
494        let intervals = vec![iv("chr1", 150, 250), iv("chr1", 310, 390), iv("chrX", 0, 100)];
495        let results = liftover_batch(&cf, &intervals, 0.95);
496        assert_eq!(results.len(), 3);
497        assert!(matches!(results[0], LiftoverResult::Mapped(_)));
498        assert_eq!(results[1], LiftoverResult::Unmapped);
499        assert_eq!(results[2], LiftoverResult::Unmapped);
500    }
501
502    #[test]
503    fn test_parse_empty_input() {
504        let cf = parse_chain("").unwrap();
505        assert!(cf.chains.is_empty());
506    }
507
508    #[test]
509    fn test_parse_chain_score_ordering() {
510        let input = "\
511chain 1000 chr1 2000 + 0 500 chrA 1000 + 0 400 1
512200 100 50
513200
514
515chain 5000 chr1 2000 + 0 800 chrB 1500 + 0 700 2
516400 100 50
517300
518";
519        let cf = parse_chain(input).unwrap();
520        let chains = &cf.chains["chr1"];
521        assert_eq!(chains.len(), 2);
522        // Higher score should be first
523        assert_eq!(chains[0].score, 5000);
524        assert_eq!(chains[1].score, 1000);
525    }
526}