Skip to main content

holodeck_lib/
fragment.rs

1//! Fragment extraction and read base generation.
2//!
3//! Handles extracting genomic fragments from haplotypes, reverse-complement
4//! operations, adapter sequence appending for short fragments, and R1/R2
5//! base extraction from fragments.
6//!
7//! # Lowercase-as-marker convention
8//!
9//! [`crate::fasta::Fasta::load_contig`] stores bases resolved from IUPAC
10//! ambiguity codes (including `N`) in lowercase, while real `A`/`C`/`G`/`T`
11//! bases remain uppercase. Reads that carry those synthesized bases through
12//! from the reference can be detected by counting lowercase bytes via
13//! [`lowercase_fraction`], and the buffers are upper-cased in place with
14//! [`uppercase_in_place`] before being emitted.
15
16use crate::haplotype::Haplotype;
17
18/// The DNA complement of a base. Preserves the case of the input so the
19/// lowercase "synthesized from ambiguity" marker (see module docs) survives
20/// reverse-complementing for R2 reads.
21#[must_use]
22fn complement(base: u8) -> u8 {
23    let upper = match base.to_ascii_uppercase() {
24        b'A' => b'T',
25        b'T' => b'A',
26        b'C' => b'G',
27        b'G' => b'C',
28        _ => b'N',
29    };
30    if base.is_ascii_lowercase() { upper.to_ascii_lowercase() } else { upper }
31}
32
33/// Reverse-complement a DNA sequence in place.
34pub fn reverse_complement(seq: &mut [u8]) {
35    seq.reverse();
36    for base in seq.iter_mut() {
37        *base = complement(*base);
38    }
39}
40
41/// A simulated fragment extracted from a haplotype.
42///
43/// Contains the fragment bases and reference coordinate mapping, plus
44/// metadata about the source haplotype and strand.
45#[derive(Debug)]
46pub struct Fragment {
47    /// Fragment bases in the forward orientation (5' to 3').
48    pub bases: Vec<u8>,
49    /// Reference position for each base in the fragment.
50    pub ref_positions: Vec<u32>,
51    /// 0-based reference start position of the fragment.
52    pub ref_start: u32,
53    /// Whether the fragment is on the forward strand.
54    pub is_forward: bool,
55    /// Index of the haplotype this fragment came from.
56    pub haplotype_index: usize,
57}
58
59/// Extract a fragment from a haplotype at a given position and strand.
60///
61/// The fragment is extracted in the forward orientation from the haplotype,
62/// then reverse-complemented if on the reverse strand.
63///
64/// # Arguments
65/// * `haplotype` — The haplotype to extract from.
66/// * `reference` — The full reference sequence for this contig.
67/// * `ref_start` — 0-based start position on the reference.
68/// * `fragment_len` — Desired fragment length in bases.
69/// * `is_forward` — Whether to read the forward or reverse strand.
70#[must_use]
71pub fn extract_fragment(
72    haplotype: &Haplotype,
73    reference: &[u8],
74    ref_start: u32,
75    fragment_len: usize,
76    is_forward: bool,
77) -> Fragment {
78    let (bases, ref_positions) = haplotype.extract_fragment(reference, ref_start, fragment_len);
79
80    Fragment {
81        bases,
82        ref_positions,
83        ref_start,
84        is_forward,
85        haplotype_index: haplotype.allele_index(),
86    }
87}
88
89/// Extract read bases from a fragment, handling adapter sequences for short
90/// fragments.
91///
92/// For R1: takes the first `read_length` bases from the fragment. If the
93/// fragment is shorter than `read_length`, appends adapter bases.
94///
95/// For R2: takes the last `read_length` bases from the fragment and
96/// reverse-complements them. If the fragment is shorter than `read_length`,
97/// appends adapter bases after the reverse-complemented fragment.
98///
99/// # Arguments
100/// * `fragment` — The source fragment bases.
101/// * `read_length` — Desired read length.
102/// * `adapter` — Adapter sequence to append for short fragments.
103/// * `is_r2` — If true, extract from the end and reverse complement.
104#[must_use]
105pub fn extract_read_bases(
106    fragment_bases: &[u8],
107    read_length: usize,
108    adapter: &[u8],
109    is_r2: bool,
110) -> Vec<u8> {
111    let frag_len = fragment_bases.len();
112
113    if is_r2 {
114        // R2: last `read_length` bases from fragment, reverse-complemented,
115        // then adapter.
116        let mut bases = Vec::with_capacity(read_length);
117        let take = frag_len.min(read_length);
118        let start = frag_len.saturating_sub(take);
119        bases.extend_from_slice(&fragment_bases[start..]);
120        reverse_complement(&mut bases);
121        append_adapter_and_pad(&mut bases, read_length, adapter);
122        bases
123    } else {
124        // R1: first `read_length` bases from fragment, then adapter.
125        let mut bases = Vec::with_capacity(read_length);
126        let take = frag_len.min(read_length);
127        bases.extend_from_slice(&fragment_bases[..take]);
128        append_adapter_and_pad(&mut bases, read_length, adapter);
129        bases
130    }
131}
132
133/// Fraction of lowercase ASCII bytes in `bases`.
134///
135/// Lowercase bases mark positions that were synthesized from IUPAC
136/// ambiguity codes (including `N`) at reference-load time, so this is a
137/// cheap proxy for "how much of this read came from an ambiguous reference
138/// region." Returns `0.0` for an empty slice.
139#[must_use]
140pub fn lowercase_fraction(bases: &[u8]) -> f64 {
141    if bases.is_empty() {
142        return 0.0;
143    }
144    // The ASCII 0x20 bit distinguishes lowercase from uppercase letters; any
145    // non-alphabetic byte (e.g. adapter pad) has this bit clear for our
146    // valid base values (A-Z / a-z) — so counting that bit is equivalent to
147    // counting lowercase letters.
148    let lower = bases.iter().filter(|&&b| b.is_ascii_lowercase()).count();
149    lower as f64 / bases.len() as f64
150}
151
152/// Upper-case every ASCII letter in `bases` in place.
153pub fn uppercase_in_place(bases: &mut [u8]) {
154    for b in bases.iter_mut() {
155        b.make_ascii_uppercase();
156    }
157}
158
159/// Append adapter bases and pad with N to reach `target_len`.
160fn append_adapter_and_pad(bases: &mut Vec<u8>, target_len: usize, adapter: &[u8]) {
161    if bases.len() < target_len {
162        let need = target_len - bases.len();
163        let adapter_take = need.min(adapter.len());
164        bases.extend_from_slice(&adapter[..adapter_take]);
165        // Pad with N if adapter is also too short.
166        while bases.len() < target_len {
167            bases.push(b'N');
168        }
169    }
170}
171
172#[cfg(test)]
173mod tests {
174    use super::*;
175
176    #[test]
177    fn test_reverse_complement() {
178        let mut seq = b"ACGT".to_vec();
179        reverse_complement(&mut seq);
180        assert_eq!(&seq, b"ACGT"); // palindrome
181
182        let mut seq2 = b"AACG".to_vec();
183        reverse_complement(&mut seq2);
184        assert_eq!(&seq2, b"CGTT");
185    }
186
187    #[test]
188    fn test_reverse_complement_with_n() {
189        let mut seq = b"ANGC".to_vec();
190        reverse_complement(&mut seq);
191        assert_eq!(&seq, b"GCNT");
192    }
193
194    #[test]
195    fn test_reverse_complement_preserves_lowercase() {
196        // Lowercase bases carry an ambiguity-resolved marker that must
197        // survive reverse-complementing.
198        let mut seq = b"aCgT".to_vec();
199        reverse_complement(&mut seq);
200        assert_eq!(&seq, b"AcGt");
201    }
202
203    #[test]
204    fn test_extract_r1_full_fragment() {
205        let fragment = b"ACGTACGTAC";
206        let bases = extract_read_bases(fragment, 10, b"ADAPTER", false);
207        assert_eq!(&bases, b"ACGTACGTAC");
208    }
209
210    #[test]
211    fn test_extract_r1_fragment_longer_than_read() {
212        let fragment = b"ACGTACGTAC";
213        let bases = extract_read_bases(fragment, 5, b"ADAPTER", false);
214        assert_eq!(&bases, b"ACGTA");
215    }
216
217    #[test]
218    fn test_extract_r1_short_fragment_with_adapter() {
219        let fragment = b"ACG";
220        let adapter = b"TTTTTT";
221        let bases = extract_read_bases(fragment, 8, adapter, false);
222        assert_eq!(&bases, b"ACGTTTTT");
223    }
224
225    #[test]
226    fn test_extract_r2_full_fragment() {
227        let fragment = b"ACGTACGTAC";
228        let bases = extract_read_bases(fragment, 10, b"ADAPTER", true);
229        // Reverse complement of ACGTACGTAC is GTACGTACGT
230        assert_eq!(&bases, b"GTACGTACGT");
231    }
232
233    #[test]
234    fn test_extract_r2_fragment_longer_than_read() {
235        let fragment = b"ACGTACGTAC";
236        let bases = extract_read_bases(fragment, 5, b"ADAPTER", true);
237        // Last 5 bases: CGTAC, reverse complement: GTACG
238        assert_eq!(&bases, b"GTACG");
239    }
240
241    #[test]
242    fn test_extract_r2_short_fragment_with_adapter() {
243        let fragment = b"ACG";
244        let adapter = b"TTTTTT";
245        let bases = extract_read_bases(fragment, 8, adapter, true);
246        // Reverse complement of ACG is CGT (3 bytes), then 5 adapter T's.
247        assert_eq!(&bases, b"CGTTTTTT"); // CGT + TTTTT = 8 bytes
248        assert_eq!(bases.len(), 8);
249    }
250
251    #[test]
252    fn test_lowercase_fraction_empty() {
253        assert!(lowercase_fraction(b"").abs() < 1e-12);
254    }
255
256    #[test]
257    fn test_lowercase_fraction_all_upper() {
258        assert!(lowercase_fraction(b"ACGTACGT").abs() < 1e-12);
259    }
260
261    #[test]
262    fn test_lowercase_fraction_all_lower() {
263        assert!((lowercase_fraction(b"acgtacgt") - 1.0).abs() < 1e-12);
264    }
265
266    #[test]
267    fn test_lowercase_fraction_mixed() {
268        // 3 lowercase out of 10.
269        let f = lowercase_fraction(b"ACaGcTAtCA");
270        assert!((f - 0.3).abs() < 1e-10, "expected 0.3, got {f}");
271    }
272
273    #[test]
274    fn test_lowercase_fraction_ignores_non_letters() {
275        // N-pad and '-' are non-letter / uppercase — only 'a' should count.
276        // "A-Na" is length 4 with 1 lowercase letter.
277        let f = lowercase_fraction(b"A-Na");
278        assert!((f - 0.25).abs() < 1e-10, "expected 0.25, got {f}");
279    }
280
281    #[test]
282    fn test_uppercase_in_place() {
283        let mut bases = b"aCgTnN".to_vec();
284        uppercase_in_place(&mut bases);
285        assert_eq!(&bases, b"ACGTNN");
286    }
287
288    #[test]
289    fn test_uppercase_in_place_empty() {
290        let mut bases: Vec<u8> = Vec::new();
291        uppercase_in_place(&mut bases);
292        assert!(bases.is_empty());
293    }
294
295    #[test]
296    fn test_extract_empty_fragment_all_adapter() {
297        let fragment = b"";
298        let adapter = b"AGATCGG";
299        let bases = extract_read_bases(fragment, 5, b"AGATCGG", false);
300        assert_eq!(&bases, b"AGATC");
301
302        let bases_r2 = extract_read_bases(fragment, 5, adapter, true);
303        assert_eq!(&bases_r2, b"AGATC");
304    }
305}