seq_geom_parser 1.1.0

Parser and extractor for sequencing read geometry descriptions
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
//! Performance benchmarks for geometry extraction.
//!
//! These benchmarks ensure extraction stays fast (the hot path runs
//! hundreds of millions to billions of times per dataset). Run with:
//!
//!   cargo bench -p seq_geom_parser
//!
//! Results are saved to `target/criterion/` with HTML reports.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use seq_geom_parser::{parse_geometry, CompiledGeom};

/// Generate a synthetic read of the given length filled with random-ish bases.
fn make_read(len: usize, seed: u8) -> Vec<u8> {
    let bases = [b'A', b'C', b'G', b'T'];
    (0..len)
        .map(|i| bases[((i as u8).wrapping_add(seed)) as usize % 4])
        .collect()
}

/// Build a Flex v1 R2 read: 50bp bio + 18bp linker + 8bp sample BC + rest
fn make_flex_v1_r2(sample_bc: &[u8; 8]) -> Vec<u8> {
    let mut r2 = make_read(50, 42); // bio read
    r2.extend_from_slice(&[b'N'; 18]); // linker
    r2.extend_from_slice(sample_bc);
    r2.extend_from_slice(&[b'N'; 10]); // extra
    r2
}

/// Build a Flex v2 R1 read: 16bp BC + 12bp UMI + gap + anchor + 10bp sample BC + rest
fn make_flex_v2_r1(gap_len: usize, anchor: &[u8], sample_bc: &[u8; 10]) -> Vec<u8> {
    let mut r1 = make_read(16, 1); // cell BC
    r1.extend(make_read(12, 2)); // UMI
    r1.extend(vec![b'N'; gap_len]); // variable gap
    r1.extend_from_slice(anchor); // anchor
    r1.extend_from_slice(sample_bc);
    r1.extend(vec![b'N'; 20]); // extra
    r1
}

/// Build a boundary-resolved R1 read: biological prefix + anchor + variable barcode.
fn make_boundary_resolved_r1(prefix_len: usize, anchor: &[u8], barcode: &[u8]) -> Vec<u8> {
    let mut r1 = make_read(prefix_len, 7);
    r1.extend_from_slice(anchor);
    r1.extend_from_slice(barcode);
    r1
}

fn bench_parse(c: &mut Criterion) {
    let mut group = c.benchmark_group("parse");

    let geometries = [
        ("chromium_v3", "1{b[16]u[12]x:}2{r:}"),
        ("flex_v1", "1{b[16]u[12]x:}2{r[50]x[18]s[8]x:}"),
        ("flex_v2", "1{b[16]u[12]x[0-3]f[TTGCTAGGACCG]s[10]x:}2{r:}"),
        (
            "flex_v2_hamming",
            "1{b[16]u[12]x[0-3]hamming(f[TTGCTAGGACCG],1)s[10]x:}2{r:}",
        ),
    ];

    for (name, geom_str) in &geometries {
        group.bench_with_input(BenchmarkId::new("parse", name), geom_str, |b, g| {
            b.iter(|| parse_geometry(black_box(g)).unwrap());
        });
    }

    group.finish();
}

fn bench_compile(c: &mut Criterion) {
    let mut group = c.benchmark_group("compile");

    let geometries = [
        ("chromium_v3", "1{b[16]u[12]x:}2{r:}"),
        ("flex_v1", "1{b[16]u[12]x:}2{r[50]x[18]s[8]x:}"),
        ("flex_v2", "1{b[16]u[12]x[0-3]f[TTGCTAGGACCG]s[10]x:}2{r:}"),
    ];

    for (name, geom_str) in &geometries {
        let geom = parse_geometry(geom_str).unwrap();
        group.bench_with_input(BenchmarkId::new("compile", name), &geom, |b, g| {
            b.iter(|| CompiledGeom::from_fragment_geom(black_box(g)).unwrap());
        });
    }

    group.finish();
}

fn bench_extract_fixed(c: &mut Criterion) {
    let mut group = c.benchmark_group("extract_fixed");
    group.throughput(Throughput::Elements(1));

    // Chromium v3: simplest case (all fixed offsets)
    let geom = parse_geometry("1{b[16]u[12]x:}2{r:}").unwrap();
    let compiled = CompiledGeom::from_fragment_geom(&geom).unwrap();
    let r1 = make_read(150, 1);
    let r2 = make_read(150, 2);

    group.bench_function("chromium_v3", |b| {
        b.iter(|| compiled.extract(black_box(&r1), black_box(&r2)));
    });

    // Flex v1: two reads with fixed offsets
    let geom = parse_geometry("1{b[16]u[12]x:}2{r[50]x[18]s[8]x:}").unwrap();
    let compiled = CompiledGeom::from_fragment_geom(&geom).unwrap();
    let r1 = make_read(150, 1);
    let r2 = make_flex_v1_r2(b"ACTTTAGG");

    group.bench_function("flex_v1", |b| {
        b.iter(|| compiled.extract(black_box(&r1), black_box(&r2)));
    });

    group.finish();
}

fn bench_extract_anchor(c: &mut Criterion) {
    let mut group = c.benchmark_group("extract_anchor");
    group.throughput(Throughput::Elements(1));

    let anchor = b"TTGCTAGGACCG";
    let sample_bc = b"SAMPLEBC10";

    // Flex v2 with exact anchor match
    let geom = parse_geometry("1{b[16]u[12]x[0-3]f[TTGCTAGGACCG]s[10]x:}2{r:}").unwrap();
    let compiled = CompiledGeom::from_fragment_geom(&geom).unwrap();
    let r2 = make_read(150, 2);

    for gap in [0, 1, 2, 3] {
        let r1 = make_flex_v2_r1(gap, anchor, sample_bc);
        group.bench_with_input(
            BenchmarkId::new("exact_gap", gap),
            &(r1, r2.clone()),
            |b, (r1, r2)| {
                b.iter(|| compiled.extract(black_box(r1), black_box(r2)));
            },
        );
    }

    // Flex v2 with hamming(1) tolerance
    let geom = parse_geometry("1{b[16]u[12]x[0-3]hamming(f[TTGCTAGGACCG],1)s[10]x:}2{r:}").unwrap();
    let compiled_h1 = CompiledGeom::from_fragment_geom(&geom).unwrap();

    // Exact match with hamming enabled (should still be fast)
    let r1_exact = make_flex_v2_r1(2, anchor, sample_bc);
    group.bench_function("hamming1_exact", |b| {
        b.iter(|| compiled_h1.extract(black_box(&r1_exact), black_box(&r2)));
    });

    // 1 mismatch in anchor
    let mut anchor_mut = anchor.to_vec();
    anchor_mut[11] = b'A'; // last base mutated
    let r1_mismatch = make_flex_v2_r1(2, &anchor_mut, sample_bc);
    group.bench_function("hamming1_mismatch", |b| {
        b.iter(|| compiled_h1.extract(black_box(&r1_mismatch), black_box(&r2)));
    });

    // >1 mismatches near the front should reject quickly.
    let mut anchor_early_reject = anchor.to_vec();
    anchor_early_reject[0] = b'C';
    anchor_early_reject[1] = b'A';
    let r1_early_reject = make_flex_v2_r1(2, &anchor_early_reject, sample_bc);
    group.bench_function("hamming1_early_reject", |b| {
        b.iter(|| compiled_h1.extract(black_box(&r1_early_reject), black_box(&r2)));
    });

    // Anchor not found (worst case: searches all positions)
    let r1_nofind = make_read(80, 99);
    group.bench_function("anchor_not_found", |b| {
        b.iter(|| compiled.extract(black_box(&r1_nofind), black_box(&r2)));
    });

    group.finish();
}

fn bench_extract_boundary_resolved(c: &mut Criterion) {
    let mut group = c.benchmark_group("extract_boundary_resolved");
    group.throughput(Throughput::Elements(1));

    let geom = parse_geometry("1{r:f[ACAGT]b[9-11]}2{u[12]x:}").unwrap();
    let compiled = CompiledGeom::from_fragment_geom(&geom).unwrap();
    let anchor = b"ACAGT";
    let umi_r2 = b"TTTTTTTTTTTT";

    for barcode_len in [9usize, 10, 11] {
        let barcode = &b"BARCODE12345"[..barcode_len];
        let r1 = make_boundary_resolved_r1(24, anchor, barcode);
        let mut r2 = Vec::new();
        r2.extend_from_slice(umi_r2);
        r2.extend_from_slice(b"tail");
        group.bench_with_input(
            BenchmarkId::new("single_anchor_suffix_bc", barcode_len),
            &(r1, r2),
            |b, (r1, r2)| {
                b.iter(|| compiled.extract(black_box(r1), black_box(r2)));
            },
        );
    }

    // Repeated anchors exercise the leftmost-best tie break path.
    let repeated_anchor_r1 = {
        let mut r1 = make_read(20, 11);
        r1.extend_from_slice(anchor);
        r1.extend_from_slice(b"ACGT");
        r1.extend_from_slice(anchor);
        r1.extend_from_slice(b"BARCODE09");
        r1
    };
    let mut r2 = Vec::new();
    r2.extend_from_slice(umi_r2);
    r2.extend_from_slice(b"tail");
    group.bench_function("repeated_anchor_tiebreak", |b| {
        b.iter(|| compiled.extract(black_box(&repeated_anchor_r1), black_box(&r2)));
    });

    group.finish();
}

/// Hardcoded extraction mimicking ChromiumProtocol::extract_tech_seqs exactly.
/// This is the absolute minimum work: two bounds checks, two slices, return.
#[inline(never)]
fn hardcoded_chromium_v3_extract<'a>(
    r1: &'a [u8],
    r2: &'a [u8],
) -> (Option<&'a [u8]>, Option<&'a [u8]>, &'a [u8]) {
    let bc_len = 16;
    let umi_len = 12;
    let barcode = if r1.len() >= bc_len {
        Some(&r1[..bc_len])
    } else {
        None
    };
    let umi = if r1.len() >= bc_len + umi_len {
        Some(&r1[bc_len..bc_len + umi_len])
    } else {
        None
    };
    (barcode, umi, r2)
}

/// Same but for Chromium v2 (16bp BC, 10bp UMI).
#[inline(never)]
fn hardcoded_chromium_v2_extract<'a>(
    r1: &'a [u8],
    r2: &'a [u8],
) -> (Option<&'a [u8]>, Option<&'a [u8]>, &'a [u8]) {
    let bc_len = 16;
    let umi_len = 10;
    let barcode = if r1.len() >= bc_len {
        Some(&r1[..bc_len])
    } else {
        None
    };
    let umi = if r1.len() >= bc_len + umi_len {
        Some(&r1[bc_len..bc_len + umi_len])
    } else {
        None
    };
    (barcode, umi, r2)
}

fn bench_hardcoded_vs_compiled(c: &mut Criterion) {
    let mut group = c.benchmark_group("hardcoded_vs_compiled");
    group.throughput(Throughput::Elements(1));

    let r1 = make_read(150, 1);
    let r2 = make_read(150, 2);

    // Hardcoded v3
    group.bench_function("hardcoded_v3", |b| {
        b.iter(|| hardcoded_chromium_v3_extract(black_box(&r1), black_box(&r2)));
    });

    // Compiled v3 via enum dispatch
    let geom_v3 = parse_geometry("1{b[16]u[12]x:}2{r:}").unwrap();
    let compiled_v3 = CompiledGeom::from_fragment_geom(&geom_v3).unwrap();
    group.bench_function("compiled_v3_dispatch", |b| {
        b.iter(|| compiled_v3.extract(black_box(&r1), black_box(&r2)));
    });

    // Compiled v3 via direct variant call (no enum branch)
    let simple_v3 = match &compiled_v3 {
        CompiledGeom::Simple(ext) => ext,
        _ => panic!("expected Simple variant for v3"),
    };
    group.bench_function("compiled_v3_direct", |b| {
        b.iter(|| simple_v3.extract(black_box(&r1), black_box(&r2)));
    });

    // Hardcoded v2
    group.bench_function("hardcoded_v2", |b| {
        b.iter(|| hardcoded_chromium_v2_extract(black_box(&r1), black_box(&r2)));
    });

    // Compiled v2 via enum dispatch
    let geom_v2 = parse_geometry("1{b[16]u[10]x:}2{r:}").unwrap();
    let compiled_v2 = CompiledGeom::from_fragment_geom(&geom_v2).unwrap();
    group.bench_function("compiled_v2_dispatch", |b| {
        b.iter(|| compiled_v2.extract(black_box(&r1), black_box(&r2)));
    });

    // Compiled v2 via direct variant call
    let simple_v2 = match &compiled_v2 {
        CompiledGeom::Simple(ext) => ext,
        _ => panic!("expected Simple variant for v2"),
    };
    group.bench_function("compiled_v2_direct", |b| {
        b.iter(|| simple_v2.extract(black_box(&r1), black_box(&r2)));
    });

    group.finish();
}

fn bench_hardcoded_vs_compiled_batch(c: &mut Criterion) {
    let mut group = c.benchmark_group("hardcoded_vs_compiled_batch");
    let batch_size = 100_000usize;
    group.throughput(Throughput::Elements(batch_size as u64));

    let reads: Vec<(Vec<u8>, Vec<u8>)> = (0..batch_size)
        .map(|i| (make_read(150, i as u8), make_read(150, (i + 1) as u8)))
        .collect();

    // Hardcoded v3 batch
    group.bench_function("hardcoded_v3_100k", |b| {
        b.iter(|| {
            for (r1, r2) in &reads {
                black_box(hardcoded_chromium_v3_extract(r1, r2));
            }
        });
    });

    // Compiled v3 batch
    let geom = parse_geometry("1{b[16]u[12]x:}2{r:}").unwrap();
    let compiled = CompiledGeom::from_fragment_geom(&geom).unwrap();
    group.bench_function("compiled_v3_100k", |b| {
        b.iter(|| {
            for (r1, r2) in &reads {
                black_box(compiled.extract(r1, r2));
            }
        });
    });

    group.finish();
}

fn bench_extract_throughput(c: &mut Criterion) {
    let mut group = c.benchmark_group("throughput");

    // Simulate processing a batch of reads
    let batch_size = 10_000usize;
    group.throughput(Throughput::Elements(batch_size as u64));

    // Chromium v3
    let geom = parse_geometry("1{b[16]u[12]x:}2{r:}").unwrap();
    let compiled = CompiledGeom::from_fragment_geom(&geom).unwrap();
    let reads: Vec<(Vec<u8>, Vec<u8>)> = (0..batch_size)
        .map(|i| (make_read(150, i as u8), make_read(150, (i + 1) as u8)))
        .collect();

    group.bench_function("chromium_v3_10k", |b| {
        b.iter(|| {
            for (r1, r2) in &reads {
                black_box(compiled.extract(r1, r2));
            }
        });
    });

    // Flex v2 with anchor
    let anchor = b"TTGCTAGGACCG";
    let sample_bc = b"SAMPLEBC10";
    let geom = parse_geometry("1{b[16]u[12]x[0-3]f[TTGCTAGGACCG]s[10]x:}2{r:}").unwrap();
    let compiled_v2 = CompiledGeom::from_fragment_geom(&geom).unwrap();
    let reads_v2: Vec<(Vec<u8>, Vec<u8>)> = (0..batch_size)
        .map(|i| {
            let gap = i % 4; // vary gap length
            (
                make_flex_v2_r1(gap, anchor, sample_bc),
                make_read(150, (i + 1) as u8),
            )
        })
        .collect();

    group.bench_function("flex_v2_anchor_10k", |b| {
        b.iter(|| {
            for (r1, r2) in &reads_v2 {
                black_box(compiled_v2.extract(r1, r2));
            }
        });
    });

    group.finish();
}

criterion_group!(
    benches,
    bench_parse,
    bench_compile,
    bench_extract_fixed,
    bench_extract_anchor,
    bench_extract_boundary_resolved,
    bench_hardcoded_vs_compiled,
    bench_hardcoded_vs_compiled_batch,
    bench_extract_throughput,
);
criterion_main!(benches);