fgumi 0.1.3

High-performance tools for UMI-tagged sequencing data: extraction, grouping, and consensus calling
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
//! BAM record encoder.

mod bin;
mod cigar;
pub mod data;
mod flags;
mod mapping_quality;
mod name;
pub(crate) mod num;
mod position;
mod quality_scores;
mod reference_sequence_id;
mod sequence;
mod template_length;

use std::{error, fmt, io};

use noodles::sam::{
    self as sam,
    alignment::{Record, RecordBuf},
};

pub use self::cigar::write_cigar;
use self::{
    bin::write_bin,
    cigar::overflowing_write_cigar_op_count,
    cigar::write_cigar_from_slice,
    data::write_data,
    data::write_data_fast,
    flags::write_flags,
    mapping_quality::write_mapping_quality,
    name::write_name,
    position::write_position,
    quality_scores::{write_quality_scores, write_quality_scores_from_slice},
    reference_sequence_id::write_reference_sequence_id,
    sequence::{write_sequence, write_sequence_from_slice},
    template_length::write_template_length,
};

/// An error when a BAM record fails to encode.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum EncodeError {
    /// The reference sequence ID is invalid.
    InvalidReferenceSequenceId(reference_sequence_id::EncodeError),
    /// The alignment start is invalid.
    InvalidAlignmentStart(position::EncodeError),
    /// The mate reference sequence ID is invalid.
    InvalidMateReferenceSequenceId(reference_sequence_id::EncodeError),
    /// The mate alignment start is invalid.
    InvalidMateAlignmentStart(position::EncodeError),
}

impl error::Error for EncodeError {
    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
        #[allow(clippy::match_same_arms)]
        match self {
            Self::InvalidReferenceSequenceId(e) => Some(e),
            Self::InvalidAlignmentStart(e) => Some(e),
            Self::InvalidMateReferenceSequenceId(e) => Some(e),
            Self::InvalidMateAlignmentStart(e) => Some(e),
        }
    }
}

impl fmt::Display for EncodeError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidReferenceSequenceId(_) => write!(f, "invalid reference sequence ID"),
            Self::InvalidAlignmentStart(_) => write!(f, "invalid alignment start"),
            Self::InvalidMateReferenceSequenceId(_) => {
                write!(f, "invalid mate reference sequence ID")
            }
            Self::InvalidMateAlignmentStart(_) => write!(f, "invalid mate alignment start"),
        }
    }
}

/// Fast estimation of BAM record encoded size.
///
/// Uses a lightweight heuristic based on sequence length to avoid
/// expensive iteration over CIGAR and data fields. Estimates are
/// intentionally generous to avoid underestimation.
#[inline]
fn estimate_record_size_fast<R>(record: &R) -> usize
where
    R: Record + ?Sized,
{
    // Fixed header: 32 bytes
    // Name: ~32 bytes typical (generous estimate)
    // CIGAR: ~16 bytes typical (4 ops * 4 bytes)
    const FIXED_OVERHEAD: usize = 32 + 32 + 16;

    // Main variable components scale with sequence length:
    // - Packed sequence: (seq_len + 1) / 2
    // - Quality scores: seq_len
    // - Aux data: ~64 bytes typical (generous for common tags)
    let seq_len = record.sequence().len();

    // seq_len + seq_len.div_ceil(2) ≈ 1.5 * seq_len, plus fixed overhead and aux data padding
    FIXED_OVERHEAD + seq_len + seq_len.div_ceil(2) + 64
}

/// Encode a BAM record to raw bytes with buffer pre-allocation.
///
/// This is an optimized version of [`encode`] that estimates the record size
/// and reserves buffer capacity before encoding, reducing reallocations.
/// Only reserves additional capacity if needed.
#[inline]
pub fn encode_with_prealloc<R>(
    dst: &mut Vec<u8>,
    header: &sam::Header,
    record: &R,
) -> io::Result<()>
where
    R: Record + ?Sized,
{
    let estimated_size = estimate_record_size_fast(record);
    let current_len = dst.len();
    let available = dst.capacity() - current_len;

    // Only reserve if we don't have enough capacity
    if available < estimated_size {
        dst.reserve(estimated_size - available);
    }

    encode(dst, header, record)
}

/// Optimized encoder specifically for [`RecordBuf`].
///
/// This bypasses trait-based iterators and works directly with the underlying
/// data slices for improved performance:
/// - Sequence: direct slice encoding instead of per-base iterator
/// - Quality scores: bulk memcpy instead of per-byte iteration
///
/// Use this when encoding `RecordBuf` instances for best performance.
#[inline]
pub fn encode_record_buf(
    dst: &mut Vec<u8>,
    header: &sam::Header,
    record: &RecordBuf,
) -> io::Result<()> {
    // Pre-allocate buffer
    let estimated_size = estimate_record_size_fast(record);
    let current_len = dst.len();
    let available = dst.capacity() - current_len;
    if available < estimated_size {
        dst.reserve(estimated_size - available);
    }

    // ref_id
    let reference_sequence_id = record.reference_sequence_id();
    write_reference_sequence_id(dst, header, reference_sequence_id)
        .map_err(EncodeError::InvalidReferenceSequenceId)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    // pos
    let alignment_start = record.alignment_start();
    write_position(dst, alignment_start)
        .map_err(EncodeError::InvalidAlignmentStart)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    name::write_length(dst, record.name())?;

    // mapq
    let mapping_quality = record.mapping_quality();
    write_mapping_quality(dst, mapping_quality);

    // bin
    let alignment_end = record.alignment_end();
    write_bin(dst, alignment_start, alignment_end);

    // n_cigar_op
    let base_count = record.sequence().len();
    let cigar = overflowing_write_cigar_op_count(dst, base_count, record.cigar())?;

    // flag
    let flags = record.flags();
    write_flags(dst, flags);

    sequence::write_length(dst, base_count)?;

    // next_ref_id
    let mate_reference_sequence_id = record.mate_reference_sequence_id();
    write_reference_sequence_id(dst, header, mate_reference_sequence_id)
        .map_err(EncodeError::InvalidMateReferenceSequenceId)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    // next_pos
    let mate_alignment_start = record.mate_alignment_start();
    write_position(dst, mate_alignment_start)
        .map_err(EncodeError::InvalidMateAlignmentStart)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    // tlen
    let template_length = record.template_length();
    write_template_length(dst, template_length);

    // read_name
    write_name(dst, record.name())?;

    // cigar - use optimized slice-based encoding
    if let Some(placeholder_cigar) = &cigar {
        // Overflowing case: placeholder CIGAR (rare, use iterator path)
        write_cigar(dst, placeholder_cigar)?;
    } else {
        // Normal case: use fast slice-based encoding
        let cigar_ops: &[sam::alignment::record::cigar::Op] = record.cigar().as_ref();
        write_cigar_from_slice(dst, cigar_ops)?;
    }

    // seq - use optimized slice-based encoding
    let seq_bytes: &[u8] = record.sequence().as_ref();
    let read_length = record.cigar().read_length();
    write_sequence_from_slice(dst, read_length, seq_bytes)?;

    // qual - use optimized slice-based encoding
    let qual_scores: &[u8] = record.quality_scores().as_ref();
    write_quality_scores_from_slice(dst, base_count, qual_scores)?;

    // data - use optimized fast path for common tags
    write_data_fast(dst, record.data())?;

    if cigar.is_some() {
        data::field::write_cigar(dst, record.cigar())?;
    }

    Ok(())
}

/// Encode a BAM record to raw bytes.
///
/// This function encodes a BAM record to its raw byte representation,
/// WITHOUT the 4-byte `block_size` prefix. The caller is responsible for
/// adding the `block_size` prefix if needed.
pub fn encode<R>(dst: &mut Vec<u8>, header: &sam::Header, record: &R) -> io::Result<()>
where
    R: Record + ?Sized,
{
    // ref_id
    let reference_sequence_id = record.reference_sequence_id(header).transpose()?;
    write_reference_sequence_id(dst, header, reference_sequence_id)
        .map_err(EncodeError::InvalidReferenceSequenceId)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    // pos
    let alignment_start = record.alignment_start().transpose()?;
    write_position(dst, alignment_start)
        .map_err(EncodeError::InvalidAlignmentStart)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    name::write_length(dst, record.name())?;

    // mapq
    let mapping_quality = record.mapping_quality().transpose()?;
    write_mapping_quality(dst, mapping_quality);

    // bin
    let alignment_end = record.alignment_end().transpose()?;
    write_bin(dst, alignment_start, alignment_end);

    // n_cigar_op
    let base_count = record.sequence().len();
    let cigar = overflowing_write_cigar_op_count(dst, base_count, &record.cigar())?;

    // flag
    let flags = record.flags()?;
    write_flags(dst, flags);

    sequence::write_length(dst, base_count)?;

    // next_ref_id
    let mate_reference_sequence_id = record.mate_reference_sequence_id(header).transpose()?;
    write_reference_sequence_id(dst, header, mate_reference_sequence_id)
        .map_err(EncodeError::InvalidMateReferenceSequenceId)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    // next_pos
    let mate_alignment_start = record.mate_alignment_start().transpose()?;
    write_position(dst, mate_alignment_start)
        .map_err(EncodeError::InvalidMateAlignmentStart)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    // tlen
    let template_length = record.template_length()?;
    write_template_length(dst, template_length);

    // read_name
    write_name(dst, record.name())?;

    if let Some(cigar) = &cigar {
        write_cigar(dst, cigar)?;
    } else {
        write_cigar(dst, &record.cigar())?;
    }

    let sequence = record.sequence();

    // seq
    let read_length = record.cigar().read_length()?;
    write_sequence(dst, read_length, sequence)?;

    // qual
    write_quality_scores(dst, base_count, record.quality_scores())?;

    write_data(dst, record.data())?;

    if cigar.is_some() {
        data::field::write_cigar(dst, &record.cigar())?;
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use std::num::NonZero;

    use noodles::core::Position;
    use noodles::sam::{
        alignment::RecordBuf,
        header::record::value::{Map, map::ReferenceSequence},
    };

    use super::*;

    #[test]
    fn test_encode_with_default_fields() -> Result<(), Box<dyn std::error::Error>> {
        let mut buf = Vec::new();
        let header = sam::Header::default();
        let record = RecordBuf::default();
        encode(&mut buf, &header, &record)?;

        let expected = [
            0xff, 0xff, 0xff, 0xff, // ref_id = -1
            0xff, 0xff, 0xff, 0xff, // pos = -1
            0x02, // l_read_name = 2
            0xff, // mapq = 255
            0x48, 0x12, // bin = 4680
            0x00, 0x00, // n_cigar_op = 0
            0x04, 0x00, // flag = 4
            0x00, 0x00, 0x00, 0x00, // l_seq = 0
            0xff, 0xff, 0xff, 0xff, // next_ref_id = -1
            0xff, 0xff, 0xff, 0xff, // next_pos = -1
            0x00, 0x00, 0x00, 0x00, // tlen = 0
            0x2a, 0x00, // read_name = "*\x00"
        ];

        assert_eq!(buf, expected);

        Ok(())
    }

    #[test]
    fn test_encode_with_all_fields() -> Result<(), Box<dyn std::error::Error>> {
        use sam::alignment::{
            record::{
                Flags, MappingQuality,
                cigar::{Op, op::Kind},
                data::field::Tag,
            },
            record_buf::{QualityScores, Sequence, data::field::Value},
        };

        let mut buf = Vec::new();

        let header = sam::Header::builder()
            .add_reference_sequence(
                "sq0",
                Map::<ReferenceSequence>::new(const { NonZero::new(8).expect("non-zero value 8") }),
            )
            .add_reference_sequence(
                "sq1",
                Map::<ReferenceSequence>::new(
                    const { NonZero::new(13).expect("non-zero value 13") },
                ),
            )
            .build();

        let record = RecordBuf::builder()
            .set_name("r0")
            .set_flags(Flags::SEGMENTED | Flags::FIRST_SEGMENT)
            .set_reference_sequence_id(1)
            .set_alignment_start(Position::try_from(9)?)
            .set_mapping_quality(MappingQuality::try_from(13)?)
            .set_cigar([Op::new(Kind::Match, 3), Op::new(Kind::SoftClip, 1)].into_iter().collect())
            .set_mate_reference_sequence_id(1)
            .set_mate_alignment_start(Position::try_from(22)?)
            .set_template_length(144)
            .set_sequence(Sequence::from(b"ACGT"))
            .set_quality_scores(QualityScores::from(vec![45, 35, 43, 50]))
            .set_data([(Tag::ALIGNMENT_HIT_COUNT, Value::from(1))].into_iter().collect())
            .build();

        encode(&mut buf, &header, &record)?;

        let expected = [
            0x01, 0x00, 0x00, 0x00, // ref_id = 1
            0x08, 0x00, 0x00, 0x00, // pos = 8
            0x03, // l_read_name = 3
            0x0d, // mapq = 13
            0x49, 0x12, // bin = 4681
            0x02, 0x00, // n_cigar_op = 2
            0x41, 0x00, // flag = 65
            0x04, 0x00, 0x00, 0x00, // l_seq = 4
            0x01, 0x00, 0x00, 0x00, // next_ref_id = 1
            0x15, 0x00, 0x00, 0x00, // next_pos = 21
            0x90, 0x00, 0x00, 0x00, // tlen = 144
            b'r', b'0', 0x00, // read_name = "r0\x00"
            0x30, 0x00, 0x00, 0x00, // cigar[0] = 3M
            0x14, 0x00, 0x00, 0x00, // cigar[1] = 1S
            0x12, 0x48, // seq = ACGT
            0x2d, 0x23, 0x2b, 0x32, // qual = NDLS
            b'N', b'H', b'C', 0x01, // data[0] = NH:i:1
        ];

        assert_eq!(buf, expected);

        Ok(())
    }

    #[test]
    fn test_encode_with_oversized_cigar() -> Result<(), Box<dyn std::error::Error>> {
        use sam::alignment::{
            record::{
                Flags,
                cigar::{Op, op::Kind},
                data::field::Tag,
            },
            record_buf::{Cigar, Sequence, data::field::Value},
        };

        const BASE_COUNT: usize = 65536;

        let mut buf = Vec::new();

        let header = sam::Header::builder()
            .add_reference_sequence(
                "sq0",
                Map::<ReferenceSequence>::new(
                    const { NonZero::new(131_072).expect("non-zero value 131_072") },
                ),
            )
            .build();

        let cigar = Cigar::from(vec![Op::new(Kind::Match, 1); BASE_COUNT]);
        let sequence = Sequence::from(vec![b'A'; BASE_COUNT]);

        let record = RecordBuf::builder()
            .set_flags(Flags::empty())
            .set_reference_sequence_id(0)
            .set_alignment_start(Position::MIN)
            .set_cigar(cigar)
            .set_sequence(sequence)
            .set_data([(Tag::ALIGNMENT_HIT_COUNT, Value::from(1))].into_iter().collect())
            .build();

        encode(&mut buf, &header, &record)?;

        let mut expected = vec![
            0x00, 0x00, 0x00, 0x00, // ref_id = 0
            0x00, 0x00, 0x00, 0x00, // pos = 1
            0x02, // l_read_name = 2
            0xff, // mapq = 255
            0x49, 0x02, // bin = 585
            0x02, 0x00, // n_cigar_op = 2
            0x00, 0x00, // flag = <empty>
            0x00, 0x00, 0x01, 0x00, // l_seq = 65536
            0xff, 0xff, 0xff, 0xff, // next_ref_id = -1
            0xff, 0xff, 0xff, 0xff, // next_pos = -1
            0x00, 0x00, 0x00, 0x00, // tlen = 0
            b'*', 0x00, // read_name = "*\x00"
            0x04, 0x00, 0x10, 0x00, // cigar[0] = 65536S
            0x03, 0x00, 0x10, 0x00, // cigar[1] = 65536N
        ];

        expected.resize(expected.len() + BASE_COUNT.div_ceil(2), 0x11); // seq = [A, ...]
        expected.resize(expected.len() + BASE_COUNT, 0xff); // qual = [0xff, ...]
        expected.extend([b'N', b'H', b'C', 0x01]); // data[0] = NH:i:1

        // data[1] = CG:B,I:...
        expected.extend([b'C', b'G', b'B', b'I', 0x00, 0x00, 0x01, 0x00]);
        expected.extend((0..BASE_COUNT).flat_map(|_| {
            [
                0x10, 0x00, 0x00, 0x00, // 1M
            ]
        }));

        assert_eq!(buf, expected);

        Ok(())
    }
}