Skip to main content

ferro_hgvs/spdi/
convert.rs

1//! HGVS to SPDI conversion.
2//!
3//! This module provides conversion functions between HGVS and SPDI formats.
4//!
5//! # Coordinate Systems
6//!
7//! - HGVS uses 1-based coordinates (see [`OneBasedPos`])
8//! - SPDI uses 0-based interbase coordinates (see [`ZeroBasedPos`])
9//!
10//! For a variant at genomic position 12345 (1-based):
11//! - HGVS: `NC_000001.11:g.12345A>G`
12//! - SPDI: `NC_000001.11:12344:A:G` (0-based)
13//!
14//! # Supported Conversions
15//!
16//! | HGVS | SPDI |
17//! |------|------|
18//! | Substitution `g.12345A>G` | `seq:12344:A:G` |
19//! | Deletion `g.100_102del` | `seq:99:NNN:` (requires reference) |
20//! | Insertion `g.100_101insATG` | `seq:100::ATG` |
21//! | Delins `g.100_102delinsATG` | `seq:99:NNN:ATG` (requires reference) |
22//! | Duplication `g.100_102dup` | `seq:102::NNN` (requires reference) |
23//!
24//! Note: Deletions, delins, and duplications require reference sequence data
25//! to determine the deleted sequence.
26//!
27//! [`OneBasedPos`]: crate::coords::OneBasedPos
28//! [`ZeroBasedPos`]: crate::coords::ZeroBasedPos
29
30use super::SpdiVariant;
31use crate::coords::{OneBasedPos, ZeroBasedPos};
32use crate::error::FerroError;
33use crate::hgvs::edit::{InsertedSequence, NaEdit, Sequence};
34use crate::hgvs::interval::Interval;
35use crate::hgvs::location::GenomePos;
36use crate::hgvs::parser::accession::parse_accession;
37use crate::hgvs::variant::{GenomeVariant, HgvsVariant, LocEdit};
38
39/// Error type for conversion failures.
40#[derive(Debug, Clone, PartialEq, Eq)]
41pub enum ConversionError {
42    /// The variant type is not supported for conversion.
43    UnsupportedVariantType {
44        /// Description of the unsupported type.
45        description: String,
46    },
47    /// Missing reference sequence data needed for conversion.
48    MissingReferenceData {
49        /// Description of what data is missing.
50        description: String,
51    },
52    /// The edit type is not supported for conversion.
53    UnsupportedEditType {
54        /// Description of the unsupported edit.
55        description: String,
56    },
57    /// Invalid position or interval.
58    InvalidPosition {
59        /// Description of the position error.
60        description: String,
61    },
62    /// Invalid accession format.
63    InvalidAccession {
64        /// Description of the accession error.
65        description: String,
66    },
67}
68
69impl std::fmt::Display for ConversionError {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71        match self {
72            ConversionError::UnsupportedVariantType { description } => {
73                write!(
74                    f,
75                    "unsupported variant type for conversion: {}",
76                    description
77                )
78            }
79            ConversionError::MissingReferenceData { description } => {
80                write!(f, "missing reference data: {}", description)
81            }
82            ConversionError::UnsupportedEditType { description } => {
83                write!(f, "unsupported edit type for conversion: {}", description)
84            }
85            ConversionError::InvalidPosition { description } => {
86                write!(f, "invalid position: {}", description)
87            }
88            ConversionError::InvalidAccession { description } => {
89                write!(f, "invalid accession: {}", description)
90            }
91        }
92    }
93}
94
95impl std::error::Error for ConversionError {}
96
97impl From<ConversionError> for FerroError {
98    fn from(err: ConversionError) -> Self {
99        FerroError::ConversionError {
100            msg: err.to_string(),
101        }
102    }
103}
104
105/// Helper to convert a Sequence to a String.
106fn sequence_to_string(seq: &Sequence) -> String {
107    seq.to_string()
108}
109
110/// Helper to convert an InsertedSequence to a String (for literal sequences only).
111fn inserted_sequence_to_string(seq: &InsertedSequence) -> Option<String> {
112    match seq {
113        InsertedSequence::Literal(s) => Some(s.to_string()),
114        _ => None,
115    }
116}
117
118/// Helper to get start position from an interval.
119fn get_start_pos(interval: &Interval<GenomePos>) -> Option<u64> {
120    interval.start.inner().map(|p| p.base)
121}
122
123/// Helper to get end position from an interval.
124fn get_end_pos(interval: &Interval<GenomePos>) -> Option<u64> {
125    interval.end.inner().map(|p| p.base)
126}
127
128/// Convert a genomic HGVS variant to SPDI format.
129///
130/// This is a "simple" conversion that works for substitutions where the
131/// reference and alternate sequences are explicitly stated in the HGVS.
132/// For deletions, insertions, and duplications, use the version that takes
133/// a reference provider.
134///
135/// # Arguments
136///
137/// * `variant` - A genomic HGVS variant (g. coordinate system)
138///
139/// # Returns
140///
141/// * `Ok(SpdiVariant)` - Successfully converted variant
142/// * `Err(ConversionError)` - Conversion failed
143///
144/// # Examples
145///
146/// ```
147/// use ferro_hgvs::spdi::convert::hgvs_to_spdi_simple;
148/// use ferro_hgvs::parse_hgvs;
149///
150/// let hgvs = parse_hgvs("NC_000001.11:g.12345A>G").unwrap();
151/// let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
152/// assert_eq!(spdi.to_string(), "NC_000001.11:12344:A:G");
153/// ```
154pub fn hgvs_to_spdi_simple(variant: &HgvsVariant) -> Result<SpdiVariant, ConversionError> {
155    match variant {
156        HgvsVariant::Genome(g) => genome_to_spdi_simple(g),
157        _ => Err(ConversionError::UnsupportedVariantType {
158            description: format!(
159                "only genomic (g.) variants can be directly converted to SPDI, got {}",
160                variant.variant_type()
161            ),
162        }),
163    }
164}
165
166/// Convert a genomic variant to SPDI (simple conversion).
167fn genome_to_spdi_simple(variant: &GenomeVariant) -> Result<SpdiVariant, ConversionError> {
168    let sequence = variant.accession.to_string();
169    let interval = &variant.loc_edit.location;
170
171    // Get the edit (unwrap from Mu)
172    let edit = variant
173        .loc_edit
174        .edit
175        .inner()
176        .ok_or_else(|| ConversionError::InvalidPosition {
177            description: "cannot convert variant with unknown edit".to_string(),
178        })?;
179
180    // Get start position
181    let start_pos = get_start_pos(interval).ok_or_else(|| ConversionError::InvalidPosition {
182        description: "cannot convert variant with unknown start position".to_string(),
183    })?;
184
185    // Validate and wrap in type-safe 1-based position
186    let hgvs_pos_ob =
187        OneBasedPos::try_new(start_pos).ok_or_else(|| ConversionError::InvalidPosition {
188            description: "position 0 is not valid in HGVS".to_string(),
189        })?;
190
191    // Convert 1-based HGVS position to 0-based SPDI position using type-safe conversion
192    let spdi_pos_zb: ZeroBasedPos = hgvs_pos_ob.to_zero_based();
193    let spdi_pos = spdi_pos_zb.value();
194
195    match edit {
196        NaEdit::Substitution {
197            reference,
198            alternative,
199        } => {
200            // Substitution: g.12345A>G -> seq:12344:A:G
201            Ok(SpdiVariant::new(
202                sequence,
203                spdi_pos,
204                reference.to_string(),
205                alternative.to_string(),
206            ))
207        }
208        NaEdit::Insertion { sequence: inserted } => {
209            // Insertion: g.100_101insATG -> seq:100::ATG
210            // For insertion, HGVS uses positions flanking the insertion point
211            // SPDI position is the 0-based position where insertion happens
212            let ins_str = inserted_sequence_to_string(inserted).ok_or_else(|| {
213                ConversionError::MissingReferenceData {
214                    description: "insertion sequence is not a literal sequence".to_string(),
215                }
216            })?;
217            Ok(SpdiVariant::new(sequence, spdi_pos, "", ins_str))
218        }
219        NaEdit::Duplication {
220            sequence: dup_seq, ..
221        } => {
222            // Duplication with sequence: g.100_102dupATG -> seq:102::ATG
223            // (insertion of the duplicated sequence after the original)
224            if let Some(seq) = dup_seq {
225                // Position for dup is at the end of the duplicated region
226                let end_pos = get_end_pos(interval).unwrap_or(start_pos);
227                // Convert 1-based HGVS end position to 0-based SPDI position
228                let end_pos_ob = OneBasedPos::new(end_pos);
229                let spdi_end_zb = end_pos_ob.to_zero_based();
230                Ok(SpdiVariant::new(
231                    sequence,
232                    spdi_end_zb.value(),
233                    "",
234                    sequence_to_string(seq),
235                ))
236            } else {
237                Err(ConversionError::MissingReferenceData {
238                    description:
239                        "duplication sequence not provided; reference data needed to determine duplicated bases"
240                            .to_string(),
241                })
242            }
243        }
244        NaEdit::Deletion {
245            sequence: del_seq, ..
246        } => {
247            // Deletion: g.100_102del -> seq:99:ATG:
248            // Need the deleted sequence
249            if let Some(seq) = del_seq {
250                Ok(SpdiVariant::new(
251                    sequence,
252                    spdi_pos,
253                    sequence_to_string(seq),
254                    "",
255                ))
256            } else {
257                Err(ConversionError::MissingReferenceData {
258                    description:
259                        "deleted sequence not provided; reference data needed to determine deleted bases"
260                            .to_string(),
261                })
262            }
263        }
264        NaEdit::Delins { sequence: _ins_seq } => {
265            // Delins: g.100_102delinsATG -> seq:99:XYZ:ATG
266            // We need to know the deleted sequence to create a valid SPDI
267            // Without reference data, we cannot determine what was deleted
268            // Calculate deletion length from interval (saturating to avoid overflow)
269            let end_pos = get_end_pos(interval).unwrap_or(start_pos);
270            let del_len = (end_pos - start_pos).saturating_add(1) as usize;
271
272            // Without reference data, we can't know the deleted sequence
273            Err(ConversionError::MissingReferenceData {
274                description: format!(
275                    "Cannot convert delins to SPDI: deleted sequence of length {} is unknown (no reference data)",
276                    del_len
277                ),
278            })
279        }
280        NaEdit::Identity {
281            sequence: id_seq, ..
282        } => {
283            // Identity: g.100= or g.100A=
284            let ref_base = id_seq.as_ref().map(sequence_to_string).unwrap_or_default();
285            Ok(SpdiVariant::new(
286                sequence,
287                spdi_pos,
288                ref_base.clone(),
289                ref_base,
290            ))
291        }
292        NaEdit::Inversion { .. } => Err(ConversionError::UnsupportedEditType {
293            description: "inversions cannot be represented in SPDI format".to_string(),
294        }),
295        NaEdit::Repeat { .. } => Err(ConversionError::UnsupportedEditType {
296            description: "repeat variants cannot be directly converted to SPDI".to_string(),
297        }),
298        NaEdit::CopyNumber { .. } => Err(ConversionError::UnsupportedEditType {
299            description: "copy number variants cannot be represented in SPDI format".to_string(),
300        }),
301        NaEdit::Conversion { .. } => Err(ConversionError::UnsupportedEditType {
302            description: "conversion variants cannot be represented in SPDI format".to_string(),
303        }),
304        _ => Err(ConversionError::UnsupportedEditType {
305            description: format!("unsupported edit type: {:?}", edit),
306        }),
307    }
308}
309
310/// Convert an SPDI variant to HGVS genomic format.
311///
312/// # Arguments
313///
314/// * `spdi` - The SPDI variant to convert
315///
316/// # Returns
317///
318/// * `Ok(HgvsVariant)` - Successfully converted variant
319/// * `Err(ConversionError)` - Conversion failed
320///
321/// # Examples
322///
323/// ```
324/// use ferro_hgvs::spdi::{SpdiVariant, convert::spdi_to_hgvs};
325///
326/// let spdi = SpdiVariant::new("NC_000001.11", 12344, "A", "G");
327/// let hgvs = spdi_to_hgvs(&spdi).unwrap();
328/// assert_eq!(hgvs.to_string(), "NC_000001.11:g.12345A>G");
329/// ```
330pub fn spdi_to_hgvs(spdi: &SpdiVariant) -> Result<HgvsVariant, ConversionError> {
331    // Parse the accession using the HGVS parser
332    let accession = parse_accession(&spdi.sequence)
333        .map(|(_, acc)| acc)
334        .map_err(|_| ConversionError::InvalidAccession {
335            description: format!("could not parse accession: {}", spdi.sequence),
336        })?;
337
338    // Convert 0-based SPDI position to 1-based HGVS position using type-safe conversion
339    let spdi_pos_zb = ZeroBasedPos::new(spdi.position);
340    let hgvs_pos_ob = spdi_pos_zb.to_one_based();
341    let hgvs_pos = hgvs_pos_ob.value();
342
343    // Determine the edit type based on deletion and insertion
344    let (interval, edit) = if spdi.is_identity() {
345        // Identity
346        let seq = if spdi.deletion.is_empty() {
347            None
348        } else {
349            Some(string_to_sequence(&spdi.deletion)?)
350        };
351        (
352            Interval::point(GenomePos::new(hgvs_pos)),
353            NaEdit::Identity {
354                sequence: seq,
355                whole_entity: false,
356            },
357        )
358    } else if spdi.deletion.len() == 1 && spdi.insertion.len() == 1 {
359        // SNV substitution
360        let ref_base = char_to_base(spdi.deletion.chars().next().unwrap())?;
361        let alt_base = char_to_base(spdi.insertion.chars().next().unwrap())?;
362        (
363            Interval::point(GenomePos::new(hgvs_pos)),
364            NaEdit::Substitution {
365                reference: ref_base,
366                alternative: alt_base,
367            },
368        )
369    } else if spdi.is_deletion() {
370        // Pure deletion
371        let del_len = spdi.deletion.len();
372        let del_seq = string_to_sequence(&spdi.deletion)?;
373        let interval = if del_len > 1 {
374            Interval::new(
375                GenomePos::new(hgvs_pos),
376                GenomePos::new(hgvs_pos + del_len as u64 - 1),
377            )
378        } else {
379            Interval::point(GenomePos::new(hgvs_pos))
380        };
381        (
382            interval,
383            NaEdit::Deletion {
384                sequence: Some(del_seq),
385                length: None,
386            },
387        )
388    } else if spdi.is_insertion() {
389        // Pure insertion
390        // SPDI position is where insertion happens (interbase)
391        // HGVS: g.pos_pos+1insX means insert between pos and pos+1
392        let ins_seq = string_to_sequence(&spdi.insertion)?;
393        (
394            Interval::new(GenomePos::new(hgvs_pos), GenomePos::new(hgvs_pos + 1)),
395            NaEdit::Insertion {
396                sequence: InsertedSequence::Literal(ins_seq),
397            },
398        )
399    } else {
400        // Delins (different lengths or MNV)
401        let del_len = spdi.deletion.len();
402        let ins_seq = string_to_sequence(&spdi.insertion)?;
403        let interval = if del_len > 1 {
404            Interval::new(
405                GenomePos::new(hgvs_pos),
406                GenomePos::new(hgvs_pos + del_len as u64 - 1),
407            )
408        } else {
409            Interval::point(GenomePos::new(hgvs_pos))
410        };
411        (
412            interval,
413            NaEdit::Delins {
414                sequence: InsertedSequence::Literal(ins_seq),
415            },
416        )
417    };
418
419    Ok(HgvsVariant::Genome(GenomeVariant {
420        accession,
421        gene_symbol: None,
422        loc_edit: LocEdit::new(interval, edit),
423    }))
424}
425
426/// Helper to convert a string to a Sequence.
427fn string_to_sequence(s: &str) -> Result<Sequence, ConversionError> {
428    s.parse().map_err(|_| ConversionError::InvalidPosition {
429        description: format!("invalid sequence: {}", s),
430    })
431}
432
433/// Helper to convert a char to a Base.
434fn char_to_base(c: char) -> Result<crate::hgvs::edit::Base, ConversionError> {
435    crate::hgvs::edit::Base::from_char(c).ok_or_else(|| ConversionError::InvalidPosition {
436        description: format!("invalid base character: {}", c),
437    })
438}
439
440#[cfg(test)]
441mod tests {
442    use super::*;
443    use crate::hgvs::parser::parse_hgvs;
444
445    // HGVS to SPDI tests
446
447    #[test]
448    fn test_hgvs_to_spdi_substitution() {
449        let hgvs = parse_hgvs("NC_000001.11:g.12345A>G").unwrap();
450        let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
451        assert_eq!(spdi.sequence, "NC_000001.11");
452        assert_eq!(spdi.position, 12344);
453        assert_eq!(spdi.deletion, "A");
454        assert_eq!(spdi.insertion, "G");
455    }
456
457    #[test]
458    fn test_hgvs_to_spdi_insertion() {
459        let hgvs = parse_hgvs("NC_000001.11:g.100_101insATG").unwrap();
460        let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
461        assert_eq!(spdi.position, 99); // 0-based, position before insertion
462        assert_eq!(spdi.deletion, "");
463        assert_eq!(spdi.insertion, "ATG");
464    }
465
466    #[test]
467    fn test_hgvs_to_spdi_deletion_with_seq() {
468        let hgvs = parse_hgvs("NC_000001.11:g.100_102delATG").unwrap();
469        let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
470        assert_eq!(spdi.position, 99);
471        assert_eq!(spdi.deletion, "ATG");
472        assert_eq!(spdi.insertion, "");
473    }
474
475    #[test]
476    fn test_hgvs_to_spdi_deletion_without_seq() {
477        let hgvs = parse_hgvs("NC_000001.11:g.100_102del").unwrap();
478        let result = hgvs_to_spdi_simple(&hgvs);
479        assert!(matches!(
480            result,
481            Err(ConversionError::MissingReferenceData { .. })
482        ));
483    }
484
485    #[test]
486    fn test_hgvs_to_spdi_delins_without_ref() {
487        // Without reference data, delins with unknown deletion sequence returns error
488        let hgvs = parse_hgvs("NC_000001.11:g.100_102delinsTTCC").unwrap();
489        let result = hgvs_to_spdi_simple(&hgvs);
490        assert!(result.is_err());
491        assert!(result.unwrap_err().to_string().contains("unknown"));
492    }
493
494    #[test]
495    fn test_hgvs_to_spdi_duplication_with_seq() {
496        let hgvs = parse_hgvs("NC_000001.11:g.100_102dupATG").unwrap();
497        let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
498        // Dup becomes insertion after the duplicated region
499        assert_eq!(spdi.position, 101); // end position, 0-based
500        assert_eq!(spdi.deletion, "");
501        assert_eq!(spdi.insertion, "ATG");
502    }
503
504    #[test]
505    fn test_hgvs_to_spdi_identity() {
506        let hgvs = parse_hgvs("NC_000001.11:g.100A=").unwrap();
507        let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
508        assert_eq!(spdi.position, 99);
509        assert_eq!(spdi.deletion, "A");
510        assert_eq!(spdi.insertion, "A");
511    }
512
513    #[test]
514    fn test_hgvs_to_spdi_unsupported_coding() {
515        let hgvs = parse_hgvs("NM_000088.3:c.100A>G").unwrap();
516        let result = hgvs_to_spdi_simple(&hgvs);
517        assert!(matches!(
518            result,
519            Err(ConversionError::UnsupportedVariantType { .. })
520        ));
521    }
522
523    #[test]
524    fn test_hgvs_to_spdi_unsupported_inversion() {
525        let hgvs = parse_hgvs("NC_000001.11:g.100_200inv").unwrap();
526        let result = hgvs_to_spdi_simple(&hgvs);
527        assert!(matches!(
528            result,
529            Err(ConversionError::UnsupportedEditType { .. })
530        ));
531    }
532
533    // SPDI to HGVS tests
534
535    #[test]
536    fn test_spdi_to_hgvs_substitution() {
537        let spdi = SpdiVariant::new("NC_000001.11", 12344, "A", "G");
538        let hgvs = spdi_to_hgvs(&spdi).unwrap();
539        assert_eq!(hgvs.to_string(), "NC_000001.11:g.12345A>G");
540    }
541
542    #[test]
543    fn test_spdi_to_hgvs_deletion() {
544        let spdi = SpdiVariant::deletion("NC_000001.11", 99, "ATG");
545        let hgvs = spdi_to_hgvs(&spdi).unwrap();
546        assert_eq!(hgvs.to_string(), "NC_000001.11:g.100_102delATG");
547    }
548
549    #[test]
550    fn test_spdi_to_hgvs_insertion() {
551        let spdi = SpdiVariant::insertion("NC_000001.11", 100, "ATG");
552        let hgvs = spdi_to_hgvs(&spdi).unwrap();
553        assert_eq!(hgvs.to_string(), "NC_000001.11:g.101_102insATG");
554    }
555
556    #[test]
557    fn test_spdi_to_hgvs_delins() {
558        let spdi = SpdiVariant::delins("NC_000001.11", 99, "ATG", "TTCC");
559        let hgvs = spdi_to_hgvs(&spdi).unwrap();
560        assert_eq!(hgvs.to_string(), "NC_000001.11:g.100_102delinsTTCC");
561    }
562
563    #[test]
564    fn test_spdi_to_hgvs_identity() {
565        let spdi = SpdiVariant::new("NC_000001.11", 99, "A", "A");
566        let hgvs = spdi_to_hgvs(&spdi).unwrap();
567        assert_eq!(hgvs.to_string(), "NC_000001.11:g.100A=");
568    }
569
570    #[test]
571    fn test_spdi_to_hgvs_single_del() {
572        let spdi = SpdiVariant::deletion("NC_000001.11", 99, "A");
573        let hgvs = spdi_to_hgvs(&spdi).unwrap();
574        assert_eq!(hgvs.to_string(), "NC_000001.11:g.100delA");
575    }
576
577    // Roundtrip tests
578
579    #[test]
580    fn test_roundtrip_substitution() {
581        let original = "NC_000001.11:g.12345A>G";
582        let hgvs = parse_hgvs(original).unwrap();
583        let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
584        let back = spdi_to_hgvs(&spdi).unwrap();
585        assert_eq!(back.to_string(), original);
586    }
587
588    #[test]
589    fn test_roundtrip_insertion() {
590        let original = "NC_000001.11:g.100_101insATG";
591        let hgvs = parse_hgvs(original).unwrap();
592        let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
593        let back = spdi_to_hgvs(&spdi).unwrap();
594        assert_eq!(back.to_string(), original);
595    }
596
597    #[test]
598    fn test_roundtrip_deletion_with_seq() {
599        let original = "NC_000001.11:g.100_102delATG";
600        let hgvs = parse_hgvs(original).unwrap();
601        let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
602        let back = spdi_to_hgvs(&spdi).unwrap();
603        assert_eq!(back.to_string(), original);
604    }
605
606    #[test]
607    fn test_error_display() {
608        let err = ConversionError::UnsupportedVariantType {
609            description: "test".to_string(),
610        };
611        assert!(err.to_string().contains("unsupported variant type"));
612
613        let err = ConversionError::MissingReferenceData {
614            description: "test".to_string(),
615        };
616        assert!(err.to_string().contains("missing reference data"));
617    }
618
619    // =========================================================================
620    // P1: Reference-dependent conversion tests
621    // =========================================================================
622
623    /// Mock genomic reference for testing reference-dependent conversions.
624    struct MockGenomicRef {
625        /// Map of (accession, start, end) -> sequence
626        sequences: std::collections::HashMap<(String, u64, u64), String>,
627    }
628
629    impl MockGenomicRef {
630        fn new() -> Self {
631            let mut sequences = std::collections::HashMap::new();
632
633            // Add test sequences for NC_000001.11
634            // Positions are 0-based for internal storage
635            // Sequence at g.100_102 (1-based) = indices 99-101 (0-based) = "ATG"
636            sequences.insert(("NC_000001.11".to_string(), 99, 102), "ATG".to_string());
637            // Sequence at g.200_205 (1-based) = indices 199-204 (0-based) = "GATTACA"
638            sequences.insert(
639                ("NC_000001.11".to_string(), 199, 206),
640                "GATTACA".to_string(),
641            );
642            // Sequence at g.1000_1009 (1-based) = "AAACCCGGGT"
643            sequences.insert(
644                ("NC_000001.11".to_string(), 999, 1009),
645                "AAACCCGGGT".to_string(),
646            );
647
648            Self { sequences }
649        }
650
651        fn get_sequence(&self, accession: &str, start: u64, end: u64) -> Option<String> {
652            self.sequences
653                .get(&(accession.to_string(), start, end))
654                .cloned()
655        }
656    }
657
658    /// Convert deletion HGVS to SPDI using reference data.
659    fn hgvs_to_spdi_with_ref(
660        variant: &HgvsVariant,
661        reference: &MockGenomicRef,
662    ) -> Result<SpdiVariant, ConversionError> {
663        match variant {
664            HgvsVariant::Genome(g) => {
665                let sequence = g.accession.to_string();
666                let interval = &g.loc_edit.location;
667
668                let edit =
669                    g.loc_edit
670                        .edit
671                        .inner()
672                        .ok_or_else(|| ConversionError::InvalidPosition {
673                            description: "cannot convert variant with unknown edit".to_string(),
674                        })?;
675
676                let start_pos =
677                    get_start_pos(interval).ok_or_else(|| ConversionError::InvalidPosition {
678                        description: "cannot convert variant with unknown start position"
679                            .to_string(),
680                    })?;
681                let end_pos = get_end_pos(interval).unwrap_or(start_pos);
682
683                // Convert 1-based HGVS position to 0-based SPDI position
684                let start_pos_ob = OneBasedPos::new(start_pos);
685                let spdi_pos = start_pos_ob.to_zero_based().value();
686
687                match edit {
688                    NaEdit::Deletion { sequence: None, .. } => {
689                        // Fetch sequence from reference
690                        let del_seq = reference
691                            .get_sequence(&sequence, spdi_pos, end_pos)
692                            .ok_or_else(|| ConversionError::MissingReferenceData {
693                                description: format!(
694                                    "could not fetch sequence for {}:{}-{}",
695                                    sequence, start_pos, end_pos
696                                ),
697                            })?;
698                        Ok(SpdiVariant::new(sequence, spdi_pos, del_seq, ""))
699                    }
700                    NaEdit::Delins { sequence: ins_seq } => {
701                        // Fetch deleted sequence from reference
702                        let del_seq = reference
703                            .get_sequence(&sequence, spdi_pos, end_pos)
704                            .ok_or_else(|| ConversionError::MissingReferenceData {
705                                description: format!(
706                                    "could not fetch sequence for {}:{}-{}",
707                                    sequence, start_pos, end_pos
708                                ),
709                            })?;
710                        let ins_str = inserted_sequence_to_string(ins_seq).ok_or_else(|| {
711                            ConversionError::MissingReferenceData {
712                                description: "delins inserted sequence is not a literal"
713                                    .to_string(),
714                            }
715                        })?;
716                        Ok(SpdiVariant::new(sequence, spdi_pos, del_seq, ins_str))
717                    }
718                    NaEdit::Duplication { sequence: None, .. } => {
719                        // Fetch duplicated sequence from reference
720                        let dup_seq = reference
721                            .get_sequence(&sequence, spdi_pos, end_pos)
722                            .ok_or_else(|| ConversionError::MissingReferenceData {
723                                description: format!(
724                                    "could not fetch sequence for {}:{}-{}",
725                                    sequence, start_pos, end_pos
726                                ),
727                            })?;
728                        // Duplication becomes insertion after the region
729                        // Convert 1-based HGVS end position to 0-based SPDI position
730                        let end_pos_ob = OneBasedPos::new(end_pos);
731                        let spdi_end = end_pos_ob.to_zero_based().value();
732                        Ok(SpdiVariant::new(sequence, spdi_end, "", dup_seq))
733                    }
734                    // Fall back to simple conversion for other types
735                    _ => hgvs_to_spdi_simple(variant),
736                }
737            }
738            _ => hgvs_to_spdi_simple(variant),
739        }
740    }
741
742    #[test]
743    fn test_deletion_without_seq_with_reference() {
744        let reference = MockGenomicRef::new();
745        let hgvs = parse_hgvs("NC_000001.11:g.100_102del").unwrap();
746
747        // This would fail with simple conversion
748        assert!(hgvs_to_spdi_simple(&hgvs).is_err());
749
750        // But works with reference data
751        let spdi = hgvs_to_spdi_with_ref(&hgvs, &reference).unwrap();
752        assert_eq!(spdi.sequence, "NC_000001.11");
753        assert_eq!(spdi.position, 99);
754        assert_eq!(spdi.deletion, "ATG");
755        assert_eq!(spdi.insertion, "");
756    }
757
758    #[test]
759    fn test_delins_with_reference_provides_actual_deletion() {
760        let reference = MockGenomicRef::new();
761        let hgvs = parse_hgvs("NC_000001.11:g.100_102delinsTTCC").unwrap();
762
763        // Simple conversion without reference data returns an error
764        let simple = hgvs_to_spdi_simple(&hgvs);
765        assert!(simple.is_err());
766        assert!(simple.unwrap_err().to_string().contains("unknown"));
767
768        // Reference-based conversion gets actual sequence
769        let spdi = hgvs_to_spdi_with_ref(&hgvs, &reference).unwrap();
770        assert_eq!(spdi.deletion, "ATG");
771        assert_eq!(spdi.insertion, "TTCC");
772    }
773
774    #[test]
775    fn test_delins_roundtrip_with_reference() {
776        let reference = MockGenomicRef::new();
777        let original_hgvs = parse_hgvs("NC_000001.11:g.100_102delinsTTCC").unwrap();
778
779        // Convert to SPDI with reference
780        let spdi = hgvs_to_spdi_with_ref(&original_hgvs, &reference).unwrap();
781        assert_eq!(spdi.deletion, "ATG");
782        assert_eq!(spdi.insertion, "TTCC");
783
784        // Convert back to HGVS
785        let back = spdi_to_hgvs(&spdi).unwrap();
786        assert_eq!(back.to_string(), "NC_000001.11:g.100_102delinsTTCC");
787    }
788
789    #[test]
790    fn test_deletion_roundtrip_with_reference() {
791        let reference = MockGenomicRef::new();
792        let original_hgvs = parse_hgvs("NC_000001.11:g.100_102del").unwrap();
793
794        // Convert to SPDI with reference
795        let spdi = hgvs_to_spdi_with_ref(&original_hgvs, &reference).unwrap();
796        assert_eq!(spdi.deletion, "ATG");
797        assert_eq!(spdi.insertion, "");
798
799        // Convert back to HGVS
800        let back = spdi_to_hgvs(&spdi).unwrap();
801        // Note: HGVS back includes the sequence
802        assert_eq!(back.to_string(), "NC_000001.11:g.100_102delATG");
803    }
804
805    #[test]
806    fn test_duplication_without_seq_with_reference() {
807        let reference = MockGenomicRef::new();
808        let hgvs = parse_hgvs("NC_000001.11:g.100_102dup").unwrap();
809
810        // This would fail with simple conversion
811        assert!(hgvs_to_spdi_simple(&hgvs).is_err());
812
813        // But works with reference data
814        let spdi = hgvs_to_spdi_with_ref(&hgvs, &reference).unwrap();
815        assert_eq!(spdi.sequence, "NC_000001.11");
816        // Dup is insertion after the region
817        assert_eq!(spdi.position, 101); // end position 0-based
818        assert_eq!(spdi.deletion, "");
819        assert_eq!(spdi.insertion, "ATG");
820    }
821
822    #[test]
823    fn test_long_deletion_with_reference() {
824        let reference = MockGenomicRef::new();
825        let hgvs = parse_hgvs("NC_000001.11:g.1000_1009del").unwrap();
826
827        let spdi = hgvs_to_spdi_with_ref(&hgvs, &reference).unwrap();
828        assert_eq!(spdi.deletion, "AAACCCGGGT");
829        assert_eq!(spdi.deletion.len(), 10);
830    }
831
832    #[test]
833    fn test_reference_missing_region() {
834        let reference = MockGenomicRef::new();
835        // Position not in mock reference
836        let hgvs = parse_hgvs("NC_000001.11:g.50000_50002del").unwrap();
837
838        let result = hgvs_to_spdi_with_ref(&hgvs, &reference);
839        assert!(matches!(
840            result,
841            Err(ConversionError::MissingReferenceData { .. })
842        ));
843    }
844
845    #[test]
846    fn test_substitution_falls_through_to_simple() {
847        let reference = MockGenomicRef::new();
848        let hgvs = parse_hgvs("NC_000001.11:g.12345A>G").unwrap();
849
850        // Substitution doesn't need reference, falls through
851        let spdi = hgvs_to_spdi_with_ref(&hgvs, &reference).unwrap();
852        assert_eq!(spdi.to_string(), "NC_000001.11:12344:A:G");
853    }
854
855    #[test]
856    fn test_mnv_delins_with_reference() {
857        // Multi-nucleotide variant (MNV) as delins
858        let reference = MockGenomicRef::new();
859        let hgvs = parse_hgvs("NC_000001.11:g.100_102delinsGGG").unwrap();
860
861        let spdi = hgvs_to_spdi_with_ref(&hgvs, &reference).unwrap();
862        assert_eq!(spdi.deletion, "ATG");
863        assert_eq!(spdi.insertion, "GGG");
864
865        // MNV same length should roundtrip
866        let back = spdi_to_hgvs(&spdi).unwrap();
867        assert_eq!(back.to_string(), "NC_000001.11:g.100_102delinsGGG");
868    }
869
870    // =========================================================================
871    // P3: SPDI edge case tests
872    // =========================================================================
873
874    #[test]
875    fn test_spdi_empty_deletion_insertion() {
876        // Pure insertion has empty deletion
877        let spdi = SpdiVariant::insertion("NC_000001.11", 100, "ATG");
878        assert!(spdi.is_insertion());
879        assert!(!spdi.is_deletion());
880        assert!(!spdi.is_identity());
881        assert_eq!(spdi.deletion, "");
882        assert_eq!(spdi.insertion, "ATG");
883    }
884
885    #[test]
886    fn test_spdi_empty_insertion_deletion() {
887        // Pure deletion has empty insertion
888        let spdi = SpdiVariant::deletion("NC_000001.11", 100, "ATG");
889        assert!(spdi.is_deletion());
890        assert!(!spdi.is_insertion());
891        assert!(!spdi.is_identity());
892        assert_eq!(spdi.deletion, "ATG");
893        assert_eq!(spdi.insertion, "");
894    }
895
896    #[test]
897    fn test_spdi_both_empty_is_identity() {
898        // Both empty is identity at position (unusual but valid)
899        let spdi = SpdiVariant::new("NC_000001.11", 100, "", "");
900        assert!(spdi.is_identity());
901        assert!(!spdi.is_insertion());
902        assert!(!spdi.is_deletion());
903    }
904
905    #[test]
906    fn test_spdi_single_base_insertion() {
907        let spdi = SpdiVariant::insertion("NC_000001.11", 100, "A");
908        assert!(spdi.is_insertion());
909        assert_eq!(spdi.insertion.len(), 1);
910
911        let hgvs = spdi_to_hgvs(&spdi).unwrap();
912        assert!(hgvs.to_string().contains("ins"));
913    }
914
915    #[test]
916    fn test_spdi_single_base_deletion() {
917        let spdi = SpdiVariant::deletion("NC_000001.11", 100, "A");
918        assert!(spdi.is_deletion());
919        assert_eq!(spdi.deletion.len(), 1);
920
921        let hgvs = spdi_to_hgvs(&spdi).unwrap();
922        assert!(hgvs.to_string().contains("del"));
923    }
924
925    #[test]
926    fn test_spdi_long_insertion_100bp() {
927        // Test 100bp insertion (common structural variant size)
928        let long_seq = "A".repeat(100);
929        let spdi = SpdiVariant::insertion("NC_000001.11", 12345, &long_seq);
930
931        assert!(spdi.is_insertion());
932        assert_eq!(spdi.insertion.len(), 100);
933
934        let hgvs = spdi_to_hgvs(&spdi).unwrap();
935        assert!(hgvs.to_string().contains("ins"));
936        // Verify the sequence is preserved
937        assert!(hgvs.to_string().ends_with(&format!("ins{}", long_seq)));
938    }
939
940    #[test]
941    fn test_spdi_long_deletion_100bp() {
942        // Test 100bp deletion
943        let long_seq = "ACGT".repeat(25); // 100bp
944        let spdi = SpdiVariant::deletion("NC_000001.11", 12345, &long_seq);
945
946        assert!(spdi.is_deletion());
947        assert_eq!(spdi.deletion.len(), 100);
948
949        let hgvs = spdi_to_hgvs(&spdi).unwrap();
950        assert!(hgvs.to_string().contains("del"));
951    }
952
953    #[test]
954    fn test_spdi_long_indel_asymmetric() {
955        // Delete 50bp, insert 150bp (net +100bp)
956        let del_seq = "A".repeat(50);
957        let ins_seq = "G".repeat(150);
958        let spdi = SpdiVariant::delins("NC_000001.11", 12345, &del_seq, &ins_seq);
959
960        assert!(!spdi.is_insertion());
961        assert!(!spdi.is_deletion());
962        assert_eq!(spdi.deletion.len(), 50);
963        assert_eq!(spdi.insertion.len(), 150);
964
965        let hgvs = spdi_to_hgvs(&spdi).unwrap();
966        assert!(hgvs.to_string().contains("delins"));
967    }
968
969    #[test]
970    fn test_spdi_very_long_insertion_1000bp() {
971        // Test 1000bp insertion (larger structural variant)
972        let long_seq = "ACGT".repeat(250); // 1000bp
973        let spdi = SpdiVariant::insertion("NC_000001.11", 50000, &long_seq);
974
975        assert!(spdi.is_insertion());
976        assert_eq!(spdi.insertion.len(), 1000);
977
978        // Should still convert to HGVS
979        let hgvs = spdi_to_hgvs(&spdi).unwrap();
980        assert!(hgvs.to_string().contains("ins"));
981    }
982
983    #[test]
984    fn test_spdi_position_zero() {
985        // Position 0 is valid in SPDI (0-based)
986        let spdi = SpdiVariant::new("NC_000001.11", 0, "A", "G");
987        assert_eq!(spdi.position, 0);
988
989        let hgvs = spdi_to_hgvs(&spdi).unwrap();
990        // HGVS position should be 1 (1-based)
991        assert!(hgvs.to_string().contains("g.1A>G"));
992    }
993
994    #[test]
995    fn test_spdi_position_max() {
996        // Very large position (near chromosome end)
997        let spdi = SpdiVariant::new("NC_000001.11", 248956421, "A", "G");
998
999        let hgvs = spdi_to_hgvs(&spdi).unwrap();
1000        assert!(hgvs.to_string().contains("248956422")); // 1-based
1001    }
1002
1003    #[test]
1004    fn test_spdi_lowercase_sequence() {
1005        // SPDI should handle lowercase (though uppercase is standard)
1006        let spdi = SpdiVariant::new("NC_000001.11", 100, "a", "g");
1007
1008        // The variant should work even with lowercase
1009        assert_eq!(spdi.deletion, "a");
1010        assert_eq!(spdi.insertion, "g");
1011    }
1012
1013    #[test]
1014    fn test_spdi_mixed_case_sequence() {
1015        // Mixed case sequence
1016        let spdi = SpdiVariant::new("NC_000001.11", 100, "AtGc", "GcTa");
1017
1018        assert_eq!(spdi.deletion, "AtGc");
1019        assert_eq!(spdi.insertion, "GcTa");
1020    }
1021
1022    #[test]
1023    fn test_spdi_n_bases_in_sequence() {
1024        // N (unknown) bases in sequence
1025        let spdi = SpdiVariant::new("NC_000001.11", 100, "ANG", "TNC");
1026
1027        assert_eq!(spdi.deletion, "ANG");
1028        assert_eq!(spdi.insertion, "TNC");
1029    }
1030
1031    #[test]
1032    fn test_spdi_complex_repeat_sequence() {
1033        // Repeat sequence (e.g., microsatellite)
1034        let repeat = "CAG".repeat(30); // 90bp CAG repeat
1035        let spdi = SpdiVariant::insertion("NC_000004.12", 3074876, &repeat);
1036
1037        assert!(spdi.is_insertion());
1038        assert_eq!(spdi.insertion.len(), 90);
1039    }
1040
1041    #[test]
1042    fn test_spdi_to_hgvs_delins_single_base_del() {
1043        // Single base deletion with multi-base insertion
1044        let spdi = SpdiVariant::delins("NC_000001.11", 100, "A", "TTTT");
1045
1046        let hgvs = spdi_to_hgvs(&spdi).unwrap();
1047        // Single base del + ins = delins at single position
1048        assert!(hgvs.to_string().contains("delinsTTTT"));
1049    }
1050
1051    #[test]
1052    fn test_spdi_to_hgvs_delins_single_base_ins() {
1053        // Multi-base deletion with single base insertion
1054        let spdi = SpdiVariant::delins("NC_000001.11", 100, "AAAA", "T");
1055
1056        let hgvs = spdi_to_hgvs(&spdi).unwrap();
1057        assert!(hgvs.to_string().contains("delinsT"));
1058    }
1059
1060    #[test]
1061    fn test_spdi_different_chromosome_formats() {
1062        // Various accession formats should work
1063        let test_cases = vec![
1064            ("NC_000001.11", "NC_000001.11"), // Standard RefSeq
1065            ("NC_000023.11", "NC_000023.11"), // X chromosome
1066            ("NC_000024.10", "NC_000024.10"), // Y chromosome
1067            ("NC_012920.1", "NC_012920.1"),   // Mitochondrial
1068        ];
1069
1070        for (input_acc, expected_acc) in test_cases {
1071            let spdi = SpdiVariant::new(input_acc, 100, "A", "G");
1072            assert_eq!(spdi.sequence, expected_acc);
1073
1074            let hgvs = spdi_to_hgvs(&spdi).unwrap();
1075            assert!(hgvs.to_string().starts_with(expected_acc));
1076        }
1077    }
1078
1079    #[test]
1080    fn test_spdi_roundtrip_preserves_case_normalized() {
1081        // Create SPDI substitution (SNV) which roundtrips cleanly
1082        let spdi = SpdiVariant::new("NC_000001.11", 100, "A", "G");
1083        let hgvs = spdi_to_hgvs(&spdi).unwrap();
1084        let back = hgvs_to_spdi_simple(&hgvs).unwrap();
1085
1086        // Should preserve the sequence
1087        assert_eq!(back.deletion, "A");
1088        assert_eq!(back.insertion, "G");
1089
1090        // Test multi-base delins - without reference data, the roundtrip
1091        // cannot reconstruct the deletion sequence and returns an error
1092        let spdi_delins = SpdiVariant::new("NC_000001.11", 100, "ACGT", "TGCA");
1093        let hgvs_delins = spdi_to_hgvs(&spdi_delins).unwrap();
1094        let back_delins = hgvs_to_spdi_simple(&hgvs_delins);
1095
1096        // Without reference, this should fail since the deletion sequence is unknown
1097        assert!(back_delins.is_err());
1098    }
1099
1100    #[test]
1101    fn test_spdi_empty_seq_insertion_roundtrip() {
1102        // Empty deletion (pure insertion) roundtrip
1103        let spdi = SpdiVariant::insertion("NC_000001.11", 100, "ATG");
1104        let hgvs = spdi_to_hgvs(&spdi).unwrap();
1105
1106        // HGVS format for insertion is pos_pos+1insX
1107        assert!(hgvs.to_string().contains("ins"));
1108
1109        // Roundtrip back
1110        let back = hgvs_to_spdi_simple(&hgvs).unwrap();
1111        assert_eq!(back.deletion, "");
1112        assert_eq!(back.insertion, "ATG");
1113    }
1114
1115    #[test]
1116    fn test_hgvs_to_spdi_various_accession_types() {
1117        // Test conversion from various accession types
1118        let test_variants = vec![
1119            "NC_000001.11:g.12345A>G", // Chromosome
1120            "NC_000023.11:g.12345A>G", // X chromosome
1121            "NC_012920.1:g.12345A>G",  // Mitochondrial
1122        ];
1123
1124        for variant_str in test_variants {
1125            let hgvs = parse_hgvs(variant_str).unwrap();
1126            let spdi = hgvs_to_spdi_simple(&hgvs);
1127            assert!(spdi.is_ok(), "Failed for: {}", variant_str);
1128        }
1129    }
1130
1131    #[test]
1132    fn test_spdi_display_format() {
1133        let spdi = SpdiVariant::new("NC_000001.11", 12344, "A", "G");
1134        assert_eq!(spdi.to_string(), "NC_000001.11:12344:A:G");
1135
1136        let spdi_del = SpdiVariant::deletion("NC_000001.11", 100, "ATG");
1137        assert_eq!(spdi_del.to_string(), "NC_000001.11:100:ATG:");
1138
1139        let spdi_ins = SpdiVariant::insertion("NC_000001.11", 100, "ATG");
1140        assert_eq!(spdi_ins.to_string(), "NC_000001.11:100::ATG");
1141    }
1142
1143    #[test]
1144    fn test_spdi_identity_various_lengths() {
1145        // Single base identity
1146        let spdi1 = SpdiVariant::new("NC_000001.11", 100, "A", "A");
1147        assert!(spdi1.is_identity());
1148
1149        // Multi-base identity (unusual but valid)
1150        let spdi2 = SpdiVariant::new("NC_000001.11", 100, "ATG", "ATG");
1151        assert!(spdi2.is_identity());
1152
1153        // Empty identity
1154        let spdi3 = SpdiVariant::new("NC_000001.11", 100, "", "");
1155        assert!(spdi3.is_identity());
1156    }
1157}