fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
//! Object pool for FASTQ records to enable memory reuse.
//!
//! This module provides `ReadPool`, a simple object pool that allows
//! reusing `OwnedRecord` instances instead of allocating new ones for
//! every read. This significantly reduces heap allocations when processing
//! large FASTQ files.
//!
//! ## Usage
//!
//! ```ignore
//! use fastars::io::pool::ReadPool;
//!
//! let mut pool = ReadPool::new(256); // Hint for typical read length
//!
//! // Acquire a record (from pool or newly allocated)
//! let mut record = pool.acquire();
//! record.set_from(b"read1", b"ACGT", b"IIII");
//!
//! // Use the record...
//!
//! // Return it to the pool for reuse
//! pool.release(record);
//! ```
//!
//! ## Thread Safety
//!
//! `ReadPool` is NOT thread-safe and should be used as thread-local storage.
//! Each worker thread should have its own pool instance.

use super::OwnedRecord;

/// Default pool capacity (number of records to keep in the pool).
///
/// Also used as the upper bound on how many slots a pool reserves eagerly
/// at construction time.
const DEFAULT_POOL_SIZE: usize = 1024;

/// A pool of reusable `OwnedRecord` instances.
///
/// The pool is a bounded stack of cleared records. `acquire()` pops a stored
/// record if one is available and otherwise builds a new one; `release()` and
/// `release_batch()` clear records and push them back until the pool holds
/// `max_pool_size` records, after which surplus records are simply dropped.
#[derive(Debug)]
pub struct ReadPool {
    /// Pool of reusable records.
    pool: Vec<OwnedRecord>,
    /// Hint for the typical capacity needed for records.
    capacity_hint: usize,
    /// Maximum number of records to keep in the pool.
    max_pool_size: usize,
}

impl ReadPool {
    /// Create a new pool with the given capacity hint and the default size limit.
    ///
    /// The capacity hint is forwarded to `OwnedRecord::with_capacity` whenever
    /// the pool has to build a new record.
    ///
    /// # Arguments
    ///
    /// * `capacity_hint` - Typical length of reads (e.g., 150 for Illumina short reads)
    #[inline]
    pub fn new(capacity_hint: usize) -> Self {
        Self::with_max_size(capacity_hint, DEFAULT_POOL_SIZE)
    }

    /// Create a new pool with custom size limits.
    ///
    /// At most `DEFAULT_POOL_SIZE` slots are reserved up front; the backing
    /// vector grows on demand beyond that. This keeps very large limits
    /// (including `usize::MAX` used as "effectively unbounded") from panicking
    /// or reserving a huge amount of memory before any record is pooled.
    ///
    /// # Arguments
    ///
    /// * `capacity_hint` - Typical length of reads
    /// * `max_pool_size` - Maximum records to keep in the pool
    #[inline]
    pub fn with_max_size(capacity_hint: usize, max_pool_size: usize) -> Self {
        Self {
            pool: Vec::with_capacity(max_pool_size.min(DEFAULT_POOL_SIZE)),
            capacity_hint,
            max_pool_size,
        }
    }

    /// Acquire a record from the pool.
    ///
    /// If the pool has available records, the most recently stored one is
    /// returned (records are cleared before they are stored). Otherwise, a new
    /// record is created with `OwnedRecord::with_capacity(capacity_hint)`.
    #[inline]
    pub fn acquire(&mut self) -> OwnedRecord {
        self.pool
            .pop()
            .unwrap_or_else(|| OwnedRecord::with_capacity(self.capacity_hint))
    }

    /// Release a record back to the pool.
    ///
    /// The record is cleared and added to the pool for future reuse.
    /// If the pool already holds `max_pool_size` records, the record is dropped.
    #[inline]
    pub fn release(&mut self, mut record: OwnedRecord) {
        if self.pool.len() < self.max_pool_size {
            record.clear();
            self.pool.push(record);
        }
        // Otherwise `record` goes out of scope here and its memory is freed.
    }

    /// Release multiple records back to the pool.
    ///
    /// Records are taken from the front of `records` until the pool reaches
    /// `max_pool_size`; each kept record is cleared first. Any remaining
    /// records are dropped.
    #[inline]
    pub fn release_batch(&mut self, records: Vec<OwnedRecord>) {
        let available_space = self.max_pool_size.saturating_sub(self.pool.len());
        // `take` yields an exact size hint, so `extend` reserves once.
        // Whatever is left in the iterator is dropped together with it.
        let recycled = records.into_iter().take(available_space).map(|mut record| {
            record.clear();
            record
        });
        self.pool.extend(recycled);
    }

    /// Get the current number of records in the pool.
    #[inline]
    pub fn len(&self) -> usize {
        self.pool.len()
    }

    /// Check if the pool is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.pool.is_empty()
    }

    /// Get the capacity hint used for new records.
    #[inline]
    pub fn capacity_hint(&self) -> usize {
        self.capacity_hint
    }

    /// Clear the pool, dropping all stored records.
    #[inline]
    pub fn clear(&mut self) {
        self.pool.clear();
    }

    /// Pre-populate the pool with empty records.
    ///
    /// Adds up to `count` new records, each built with the pool's capacity
    /// hint, without ever exceeding `max_pool_size`.
    ///
    /// # Arguments
    ///
    /// * `count` - Number of records to pre-allocate
    pub fn prefill(&mut self, count: usize) {
        let to_add = count.min(self.max_pool_size.saturating_sub(self.pool.len()));
        // Copy the hint out so the closure does not borrow `self` while
        // `self.pool` is mutably borrowed by `extend`.
        let capacity_hint = self.capacity_hint;
        self.pool.extend(
            std::iter::repeat_with(|| OwnedRecord::with_capacity(capacity_hint)).take(to_add),
        );
    }
}

impl Default for ReadPool {
    /// Build a pool with a capacity hint of 256 and the default size limit.
    fn default() -> Self {
        Self::new(256) // Default for typical short reads
    }
}

/// A fixed-size batch of records that can be reused.
///
/// Unlike `Vec<OwnedRecord>`, this batch pre-allocates all records upfront
/// and reuses them across iterations, avoiding per-batch allocations.
///
/// The batch owns `capacity` records at all times; only the first `len` of
/// them are considered valid. Callers fill slots through `get_mut` and then
/// publish how many are valid with `set_len`.
#[derive(Debug)]
pub struct FixedBatch {
    /// Pre-allocated records
    records: Vec<OwnedRecord>,
    /// Number of valid records (records[0..len] are valid)
    len: usize,
}

impl FixedBatch {
    /// Create a new fixed batch with the given capacity.
    ///
    /// Every slot is built immediately with
    /// `OwnedRecord::with_capacity(read_capacity)`; the batch starts with
    /// zero valid records.
    ///
    /// # Arguments
    /// * `capacity` - Maximum number of records this batch can hold
    /// * `read_capacity` - Initial capacity for each record's seq/qual vectors
    pub fn new(capacity: usize, read_capacity: usize) -> Self {
        let records = (0..capacity)
            .map(|_| OwnedRecord::with_capacity(read_capacity))
            .collect();
        Self { records, len: 0 }
    }

    /// Get a mutable reference to the record at the given index.
    ///
    /// Any slot up to the capacity may be accessed, whether or not it is
    /// currently counted as valid, so that slots can be filled before
    /// `set_len` is called.
    ///
    /// # Panics
    /// Panics if `idx >= capacity`.
    #[inline]
    pub fn get_mut(&mut self, idx: usize) -> &mut OwnedRecord {
        &mut self.records[idx]
    }

    /// Get a reference to the valid record at the given index.
    ///
    /// # Panics
    /// Panics if `idx >= len`, in every build profile.
    #[inline]
    pub fn get(&self, idx: usize) -> &OwnedRecord {
        // Index into the valid prefix only, so slots in `len..capacity`
        // are never handed out as if they were valid.
        &self.as_slice()[idx]
    }

    /// Set the number of valid records in this batch.
    ///
    /// Shrinking the length does not clear the records that fall outside the
    /// new valid range; they keep their contents until overwritten.
    ///
    /// # Panics
    /// Panics if `len` exceeds the batch capacity.
    #[inline]
    pub fn set_len(&mut self, len: usize) {
        // Checked in all builds: an oversized length would otherwise only
        // surface later as a slice-range panic in `clear`/`iter`/`as_slice`.
        assert!(len <= self.records.len(), "len exceeds capacity");
        self.len = len;
    }

    /// Get the number of valid records in this batch.
    #[inline]
    pub fn len(&self) -> usize {
        self.len
    }

    /// Check if the batch is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Get the capacity of this batch.
    #[inline]
    pub fn capacity(&self) -> usize {
        self.records.len()
    }

    /// Clear the batch, marking all records as invalid but retaining capacity.
    ///
    /// Only the currently valid records (`0..len`) are cleared; the number of
    /// slots is unchanged.
    #[inline]
    pub fn clear(&mut self) {
        for record in &mut self.records[..self.len] {
            record.clear();
        }
        self.len = 0;
    }

    /// Get an iterator over valid records.
    #[inline]
    pub fn iter(&self) -> impl Iterator<Item = &OwnedRecord> {
        self.as_slice().iter()
    }

    /// Get a mutable iterator over valid records.
    #[inline]
    pub fn iter_mut(&mut self) -> impl Iterator<Item = &mut OwnedRecord> {
        self.records[..self.len].iter_mut()
    }

    /// Convert to a Vec, consuming the batch.
    /// This is mainly for compatibility with existing code.
    ///
    /// Only the valid records are kept; the remaining slots are dropped.
    pub fn into_vec(self) -> Vec<OwnedRecord> {
        let Self { mut records, len } = self;
        records.truncate(len);
        records
    }

    /// Get a slice of valid records.
    #[inline]
    pub fn as_slice(&self) -> &[OwnedRecord] {
        &self.records[..self.len]
    }
}

/// A stack of `FixedBatch` values kept around for reuse.
///
/// Batches that are handed back are cleared and stored; batches that have to
/// be built from scratch use the sizes given at construction time.
#[derive(Debug)]
pub struct BatchPool {
    batches: Vec<FixedBatch>,
    batch_capacity: usize,
    read_capacity: usize,
}

impl BatchPool {
    /// Build an empty batch pool.
    ///
    /// # Arguments
    /// * `batch_capacity` - Record count passed to `FixedBatch::new` for new batches
    /// * `read_capacity` - Per-record capacity passed to `FixedBatch::new` for new batches
    pub fn new(batch_capacity: usize, read_capacity: usize) -> Self {
        let batches = Vec::new();
        Self {
            batches,
            batch_capacity,
            read_capacity,
        }
    }

    /// Hand out a batch: the most recently released one if any is stored,
    /// otherwise a newly constructed one.
    pub fn acquire(&mut self) -> FixedBatch {
        match self.batches.pop() {
            Some(recycled) => recycled,
            None => FixedBatch::new(self.batch_capacity, self.read_capacity),
        }
    }

    /// Take a batch back, clearing it before it is stored for later reuse.
    pub fn release(&mut self, batch: FixedBatch) {
        let mut recycled = batch;
        recycled.clear();
        self.batches.push(recycled);
    }

    /// Number of batches currently stored in the pool.
    pub fn len(&self) -> usize {
        self.batches.len()
    }

    /// Whether the pool currently stores no batches.
    pub fn is_empty(&self) -> bool {
        self.batches.is_empty()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A new pool starts out empty and reports the hint it was given.
    #[test]
    fn test_pool_new() {
        let fresh = ReadPool::new(150);
        assert!(fresh.is_empty());
        assert_eq!(fresh.capacity_hint(), 150);
    }

    /// Acquiring from an empty pool builds a record sized by the hint.
    #[test]
    fn test_pool_acquire_creates_new() {
        let mut reads = ReadPool::new(150);
        let rec = reads.acquire();

        // Both buffers must be pre-sized to at least the hint.
        assert!(rec.seq.capacity() >= 150);
        assert!(rec.qual.capacity() >= 150);
        assert!(rec.is_empty());
    }

    /// A released record is stored, then handed out again in cleared form.
    #[test]
    fn test_pool_release_and_acquire() {
        let mut reads = ReadPool::new(150);

        // Fill a record with some data before giving it back.
        let mut filled = reads.acquire();
        filled.set_from(b"read1", b"ACGT", b"IIII");

        reads.release(filled);
        assert_eq!(reads.len(), 1);

        // The pool is drained again by the next acquire.
        let reused = reads.acquire();
        assert!(reads.is_empty());
        assert!(reused.is_empty()); // contents were wiped
        assert!(reused.seq.capacity() >= 4); // buffer is still allocated
    }

    /// Releases beyond the configured maximum are discarded.
    #[test]
    fn test_pool_max_size() {
        let mut reads = ReadPool::with_max_size(100, 2);

        // Two releases reach the limit exactly.
        for _ in 0..2 {
            reads.release(OwnedRecord::with_capacity(100));
        }
        assert_eq!(reads.len(), 2);

        // A third release must not grow the pool.
        reads.release(OwnedRecord::with_capacity(100));
        assert_eq!(reads.len(), 2);
    }

    /// A batch release keeps only as many records as the pool has room for.
    #[test]
    fn test_pool_release_batch() {
        let mut reads = ReadPool::with_max_size(100, 5);

        let mut returned = Vec::new();
        for _ in 0..10 {
            returned.push(OwnedRecord::with_capacity(100));
        }

        reads.release_batch(returned);
        assert_eq!(reads.len(), 5); // limited by max_pool_size
    }

    /// Prefilling adds records but never exceeds the maximum.
    #[test]
    fn test_pool_prefill() {
        let mut reads = ReadPool::with_max_size(100, 10);
        reads.prefill(5);
        assert_eq!(reads.len(), 5);

        // Asking for ten more only leaves room for five.
        reads.prefill(10);
        assert_eq!(reads.len(), 10);
    }

    /// Clearing removes every stored record.
    #[test]
    fn test_pool_clear() {
        let mut reads = ReadPool::new(100);
        reads.prefill(5);
        assert_eq!(reads.len(), 5);

        reads.clear();
        assert!(reads.is_empty());
    }

    /// The default pool uses a capacity hint of 256.
    #[test]
    fn test_pool_default() {
        let reads = ReadPool::default();
        assert_eq!(reads.capacity_hint(), 256);
    }

    /// Buffers grown by a long read keep their size across a release/acquire cycle.
    #[test]
    fn test_record_reuse_preserves_capacity() {
        let mut reads = ReadPool::new(256);

        // Store a read that is longer than the hint.
        let mut grown = reads.acquire();
        let bases = vec![b'A'; 500];
        let quals = vec![b'I'; 500];
        grown.set_from(b"read1", &bases, &quals);

        // Capture the buffer sizes after they have grown.
        let seq_cap = grown.seq.capacity();
        let qual_cap = grown.qual.capacity();

        // Round-trip through the pool.
        reads.release(grown);
        let reused = reads.acquire();

        // Same allocation sizes, no leftover data.
        assert!(reused.seq.capacity() >= seq_cap);
        assert!(reused.qual.capacity() >= qual_cap);
        assert!(reused.is_empty());
    }
}

#[cfg(test)]
mod fixed_batch_tests {
    use super::*;

    /// A new batch has its full capacity but no valid records.
    #[test]
    fn test_fixed_batch_creation() {
        let fresh = FixedBatch::new(10, 256);
        assert_eq!(fresh.capacity(), 10);
        assert_eq!(fresh.len(), 0);
        assert!(fresh.is_empty());
    }

    /// Slots filled through `get_mut` become readable once `set_len` publishes them.
    #[test]
    fn test_fixed_batch_usage() {
        let mut slots = FixedBatch::new(3, 64);

        // Populate the first two slots and mark them valid.
        slots.get_mut(0).set_from(b"read1", b"ACGT", b"IIII");
        slots.get_mut(1).set_from(b"read2", b"TGCA", b"HHHH");
        slots.set_len(2);

        assert_eq!(slots.len(), 2);
        assert_eq!(slots.get(0).name, b"read1");
        assert_eq!(slots.get(1).seq, b"TGCA");
    }

    /// Clearing empties the batch without giving up record buffers.
    #[test]
    fn test_fixed_batch_clear() {
        let mut slots = FixedBatch::new(2, 64);
        slots.get_mut(0).set_from(b"read1", b"ACGT", b"IIII");
        slots.set_len(1);

        slots.clear();
        assert!(slots.is_empty());
        // The slot's sequence buffer is still at least as large as requested.
        assert!(slots.get_mut(0).seq.capacity() >= 64);
    }

    /// Iteration visits exactly the valid records.
    #[test]
    fn test_fixed_batch_iter() {
        let mut slots = FixedBatch::new(3, 64);
        slots.get_mut(0).set_from(b"a", b"A", b"I");
        slots.get_mut(1).set_from(b"b", b"C", b"H");
        slots.set_len(2);

        let name_count = slots.iter().map(|rec| &rec.name).count();
        assert_eq!(name_count, 2);
    }

    /// A released batch is stored and handed out again by the next acquire.
    #[test]
    fn test_batch_pool() {
        let mut batches = BatchPool::new(10, 256);
        assert!(batches.is_empty());

        let first = batches.acquire();
        assert_eq!(first.capacity(), 10);

        batches.release(first);
        assert_eq!(batches.len(), 1);

        let second = batches.acquire();
        assert!(batches.is_empty());
        assert_eq!(second.capacity(), 10);
    }
}