Skip to main content

holodeck_lib/
fragment.rs

1//! Fragment extraction and read base generation.
2//!
3//! Handles extracting genomic fragments from haplotypes, reverse-complement
4//! operations, adapter sequence appending for short fragments, and R1/R2
5//! base extraction from fragments.
6
7use crate::haplotype::Haplotype;
8
9/// The DNA complement of a base. Handles both upper and lowercase input,
10/// always returning uppercase output.
11#[must_use]
12fn complement(base: u8) -> u8 {
13    match base.to_ascii_uppercase() {
14        b'A' => b'T',
15        b'T' => b'A',
16        b'C' => b'G',
17        b'G' => b'C',
18        _ => b'N',
19    }
20}
21
22/// Reverse-complement a DNA sequence in place.
23pub fn reverse_complement(seq: &mut [u8]) {
24    seq.reverse();
25    for base in seq.iter_mut() {
26        *base = complement(*base);
27    }
28}
29
30/// A simulated fragment extracted from a haplotype.
31///
32/// Contains the fragment bases and reference coordinate mapping, plus
33/// metadata about the source haplotype and strand.
34#[derive(Debug)]
35pub struct Fragment {
36    /// Fragment bases in the forward orientation (5' to 3').
37    pub bases: Vec<u8>,
38    /// Reference position for each base in the fragment.
39    pub ref_positions: Vec<u32>,
40    /// 0-based reference start position of the fragment.
41    pub ref_start: u32,
42    /// Whether the fragment is on the forward strand.
43    pub is_forward: bool,
44    /// Index of the haplotype this fragment came from.
45    pub haplotype_index: usize,
46}
47
48/// Extract a fragment from a haplotype at a given position and strand.
49///
50/// The fragment is extracted in the forward orientation from the haplotype,
51/// then reverse-complemented if on the reverse strand.
52///
53/// # Arguments
54/// * `haplotype` — The haplotype to extract from.
55/// * `reference` — The full reference sequence for this contig.
56/// * `ref_start` — 0-based start position on the reference.
57/// * `fragment_len` — Desired fragment length in bases.
58/// * `is_forward` — Whether to read the forward or reverse strand.
59#[must_use]
60pub fn extract_fragment(
61    haplotype: &Haplotype,
62    reference: &[u8],
63    ref_start: u32,
64    fragment_len: usize,
65    is_forward: bool,
66) -> Fragment {
67    let (bases, ref_positions) = haplotype.extract_fragment(reference, ref_start, fragment_len);
68
69    Fragment {
70        bases,
71        ref_positions,
72        ref_start,
73        is_forward,
74        haplotype_index: haplotype.allele_index(),
75    }
76}
77
78/// Extract read bases from a fragment, handling adapter sequences for short
79/// fragments.
80///
81/// For R1: takes the first `read_length` bases from the fragment. If the
82/// fragment is shorter than `read_length`, appends adapter bases.
83///
84/// For R2: takes the last `read_length` bases from the fragment and
85/// reverse-complements them. If the fragment is shorter than `read_length`,
86/// appends adapter bases after the reverse-complemented fragment.
87///
88/// # Arguments
89/// * `fragment` — The source fragment bases.
90/// * `read_length` — Desired read length.
91/// * `adapter` — Adapter sequence to append for short fragments.
92/// * `is_r2` — If true, extract from the end and reverse complement.
93#[must_use]
94pub fn extract_read_bases(
95    fragment_bases: &[u8],
96    read_length: usize,
97    adapter: &[u8],
98    is_r2: bool,
99) -> Vec<u8> {
100    let frag_len = fragment_bases.len();
101
102    if is_r2 {
103        // R2: last `read_length` bases from fragment, reverse-complemented,
104        // then adapter.
105        let mut bases = Vec::with_capacity(read_length);
106        let take = frag_len.min(read_length);
107        let start = frag_len.saturating_sub(take);
108        bases.extend_from_slice(&fragment_bases[start..]);
109        reverse_complement(&mut bases);
110        append_adapter_and_pad(&mut bases, read_length, adapter);
111        bases
112    } else {
113        // R1: first `read_length` bases from fragment, then adapter.
114        let mut bases = Vec::with_capacity(read_length);
115        let take = frag_len.min(read_length);
116        bases.extend_from_slice(&fragment_bases[..take]);
117        append_adapter_and_pad(&mut bases, read_length, adapter);
118        bases
119    }
120}
121
122/// Append adapter bases and pad with N to reach `target_len`.
123fn append_adapter_and_pad(bases: &mut Vec<u8>, target_len: usize, adapter: &[u8]) {
124    if bases.len() < target_len {
125        let need = target_len - bases.len();
126        let adapter_take = need.min(adapter.len());
127        bases.extend_from_slice(&adapter[..adapter_take]);
128        // Pad with N if adapter is also too short.
129        while bases.len() < target_len {
130            bases.push(b'N');
131        }
132    }
133}
134
135#[cfg(test)]
136mod tests {
137    use super::*;
138
139    #[test]
140    fn test_reverse_complement() {
141        let mut seq = b"ACGT".to_vec();
142        reverse_complement(&mut seq);
143        assert_eq!(&seq, b"ACGT"); // palindrome
144
145        let mut seq2 = b"AACG".to_vec();
146        reverse_complement(&mut seq2);
147        assert_eq!(&seq2, b"CGTT");
148    }
149
150    #[test]
151    fn test_reverse_complement_with_n() {
152        let mut seq = b"ANGC".to_vec();
153        reverse_complement(&mut seq);
154        assert_eq!(&seq, b"GCNT");
155    }
156
157    #[test]
158    fn test_extract_r1_full_fragment() {
159        let fragment = b"ACGTACGTAC";
160        let bases = extract_read_bases(fragment, 10, b"ADAPTER", false);
161        assert_eq!(&bases, b"ACGTACGTAC");
162    }
163
164    #[test]
165    fn test_extract_r1_fragment_longer_than_read() {
166        let fragment = b"ACGTACGTAC";
167        let bases = extract_read_bases(fragment, 5, b"ADAPTER", false);
168        assert_eq!(&bases, b"ACGTA");
169    }
170
171    #[test]
172    fn test_extract_r1_short_fragment_with_adapter() {
173        let fragment = b"ACG";
174        let adapter = b"TTTTTT";
175        let bases = extract_read_bases(fragment, 8, adapter, false);
176        assert_eq!(&bases, b"ACGTTTTT");
177    }
178
179    #[test]
180    fn test_extract_r2_full_fragment() {
181        let fragment = b"ACGTACGTAC";
182        let bases = extract_read_bases(fragment, 10, b"ADAPTER", true);
183        // Reverse complement of ACGTACGTAC is GTACGTACGT
184        assert_eq!(&bases, b"GTACGTACGT");
185    }
186
187    #[test]
188    fn test_extract_r2_fragment_longer_than_read() {
189        let fragment = b"ACGTACGTAC";
190        let bases = extract_read_bases(fragment, 5, b"ADAPTER", true);
191        // Last 5 bases: CGTAC, reverse complement: GTACG
192        assert_eq!(&bases, b"GTACG");
193    }
194
195    #[test]
196    fn test_extract_r2_short_fragment_with_adapter() {
197        let fragment = b"ACG";
198        let adapter = b"TTTTTT";
199        let bases = extract_read_bases(fragment, 8, adapter, true);
200        // Reverse complement of ACG is CGT (3 bytes), then 5 adapter T's.
201        assert_eq!(&bases, b"CGTTTTTT"); // CGT + TTTTT = 8 bytes
202        assert_eq!(bases.len(), 8);
203    }
204
205    #[test]
206    fn test_extract_empty_fragment_all_adapter() {
207        let fragment = b"";
208        let adapter = b"AGATCGG";
209        let bases = extract_read_bases(fragment, 5, b"AGATCGG", false);
210        assert_eq!(&bases, b"AGATC");
211
212        let bases_r2 = extract_read_bases(fragment, 5, adapter, true);
213        assert_eq!(&bases_r2, b"AGATC");
214    }
215}