fastars 0.1.0

Ultra-fast QC and trimming for short and long reads
//! Duplication estimation.
//!
//! This module estimates sequence duplication levels by sampling a fixed
//! number of reads and counting occurrences of each sequence hash.
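//!
//! A minimal usage sketch, assuming `DuplicationStats` is reachable from the
//! crate root (marked `ignore` because the exact re-export path may differ):
//!
//! ```ignore
//! let mut dup = DuplicationStats::new();
//! // `reads` stands in for any iterator over byte-slice sequences.
//! for (i, seq) in reads.iter().enumerate() {
//!     // The second argument is the running 1-based read count.
//!     dup.update(seq, (i + 1) as u64);
//! }
//! println!("duplication: {:.2}%", dup.duplication_rate());
//! ```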

use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};

/// Serde support for `FxHashMap<u64, u64>`.
mod fx_hashmap_serde {
    use rustc_hash::FxHashMap;
    use serde::{Deserialize, Deserializer, Serialize, Serializer};
    use std::collections::HashMap;

    pub fn serialize<S>(map: &FxHashMap<u64, u64>, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // Convert to std HashMap for serialization
        let std_map: HashMap<u64, u64> = map.iter().map(|(&k, &v)| (k, v)).collect();
        std_map.serialize(serializer)
    }

    pub fn deserialize<'de, D>(deserializer: D) -> Result<FxHashMap<u64, u64>, D::Error>
    where
        D: Deserializer<'de>,
    {
        let std_map = HashMap::<u64, u64>::deserialize(deserializer)?;
        Ok(std_map.into_iter().collect())
    }
}

/// Default number of reads to sample for duplication analysis.
const DEFAULT_SAMPLE_SIZE: u64 = 100_000;

/// Hash a sequence with FNV-1a.
///
/// Bytes are upper-cased before hashing, so the hash is case-insensitive.
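///
/// For example, case folding means both spellings collide (sketch; the
/// function is private, so this example is illustrative only):
///
/// ```ignore
/// assert_eq!(hash_sequence(b"acgt"), hash_sequence(b"ACGT"));
/// ```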
fn hash_sequence(seq: &[u8]) -> u64 {
    // FNV-1a hash
    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    let mut hash = FNV_OFFSET;
    for &byte in seq {
        hash ^= byte.to_ascii_uppercase() as u64;
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    hash
}

/// Duplication statistics.
///
/// Samples the first N reads and estimates duplication rate
/// by tracking sequence hashes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicationStats {
    /// Sequence hash -> count mapping
    #[serde(with = "fx_hashmap_serde")]
    sequence_counts: FxHashMap<u64, u64>,
    /// Maximum number of reads to sample
    sample_size: u64,
    /// Number of reads processed so far
    reads_processed: u64,
    /// Whether we're still in sampling mode
    sampling_active: bool,
    /// Duplication-level histogram (levels 1 through 10+).
    /// Index 0 = reads whose sequence occurs once (unique);
    /// index 9 = reads whose sequence occurs 10 or more times.
    duplication_histogram: [u64; 10],
    /// Flag indicating histogram needs recalculation
    histogram_dirty: bool,
    /// Whether duplication evaluation is enabled
    enabled: bool,
}

impl Default for DuplicationStats {
    fn default() -> Self {
        Self::new()
    }
}

impl DuplicationStats {
    /// Create a new duplication statistics container with default settings.
    ///
    /// Samples the first 100,000 reads.
    pub fn new() -> Self {
        Self::with_sample_size(DEFAULT_SAMPLE_SIZE)
    }

    /// Create a new duplication statistics container with custom sample size.
    pub fn with_sample_size(sample_size: u64) -> Self {
        Self {
            sequence_counts: FxHashMap::default(),
            sample_size,
            reads_processed: 0,
            sampling_active: true,
            duplication_histogram: [0u64; 10],
            histogram_dirty: true,
            enabled: true,
        }
    }

    /// Create a disabled duplication statistics container.
    ///
    /// This container will not collect any statistics and uses minimal memory.
    pub fn disabled() -> Self {
        Self {
            sequence_counts: FxHashMap::default(),
            sample_size: 0,
            reads_processed: 0,
            sampling_active: false,
            duplication_histogram: [0u64; 10],
            histogram_dirty: false,
            enabled: false,
        }
    }

    /// Update statistics with a sequence.
    ///
    /// `read_count` is the running 1-based count of reads seen in the
    /// stream; once it exceeds `sample_size`, sampling stops and further
    /// reads are ignored. Empty sequences are skipped.
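    ///
    /// A small sketch of the cutoff behavior (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::with_sample_size(2);
    /// dup.update(b"ACGT", 1); // sampled
    /// dup.update(b"ACGT", 2); // sampled
    /// dup.update(b"ACGT", 3); // read_count > sample_size: ignored
    /// assert_eq!(dup.reads_processed(), 2);
    /// ```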
    #[inline]
    pub fn update(&mut self, seq: &[u8], read_count: u64) {
        // Skip if disabled
        if !self.enabled {
            return;
        }

        // Check if we should still be sampling
        if !self.sampling_active || read_count > self.sample_size {
            self.sampling_active = false;
            return;
        }

        if seq.is_empty() {
            return;
        }

        let hash = hash_sequence(seq);
        *self.sequence_counts.entry(hash).or_insert(0) += 1;
        self.reads_processed += 1;
        self.histogram_dirty = true;
    }

    /// Calculate the duplication rate.
    ///
    /// Returns the percentage of reads that are duplicates (0.0 to 100.0).
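    ///
    /// A sketch of the arithmetic (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::new();
    /// dup.update(b"ACGT", 1);
    /// dup.update(b"ACGT", 2);
    /// dup.update(b"TTAA", 3);
    /// // 3 reads, 2 unique hashes -> 1 duplicate -> ~33.3%
    /// assert!((dup.duplication_rate() - 33.333).abs() < 0.01);
    /// ```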
    pub fn duplication_rate(&self) -> f64 {
        if self.reads_processed == 0 {
            return 0.0;
        }

        let unique_sequences = self.sequence_counts.len() as u64;
        let duplicates = self.reads_processed - unique_sequences;

        (duplicates as f64 / self.reads_processed as f64) * 100.0
    }

    /// Get the number of unique sequences.
    pub fn unique_count(&self) -> u64 {
        self.sequence_counts.len() as u64
    }

    /// Get the total reads processed (sampled).
    pub fn reads_processed(&self) -> u64 {
        self.reads_processed
    }

    /// Get the estimated unique rate.
    ///
    /// Returns the percentage of reads that are unique (0.0 to 100.0).
    pub fn unique_rate(&self) -> f64 {
        100.0 - self.duplication_rate()
    }

    /// Calculate and return the duplication histogram.
    ///
    /// Each bucket counts *reads*, not distinct sequences:
    /// Index 0 = reads whose sequence was seen exactly once (unique)
    /// Index 1 = reads whose sequence was seen exactly twice
    /// ...
    /// Index 9 = reads whose sequence was seen 10+ times
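    ///
    /// A sketch of the read-level counting (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::new();
    /// dup.update(b"CCCC", 1); // one sequence seen once
    /// dup.update(b"AAAA", 2);
    /// dup.update(b"AAAA", 3); // one sequence seen twice
    /// let hist = dup.duplication_histogram();
    /// assert_eq!(hist[0], 1); // 1 read at level 1
    /// assert_eq!(hist[1], 2); // 2 reads at level 2
    /// ```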
    pub fn duplication_histogram(&mut self) -> &[u64; 10] {
        if self.histogram_dirty {
            self.recalculate_histogram();
        }
        &self.duplication_histogram
    }

    /// Get a copy of the duplication histogram without mutating the cache.
    ///
    /// Recomputes from the raw counts if the cached histogram is stale.
    pub fn histogram_snapshot(&self) -> [u64; 10] {
        if self.histogram_dirty {
            let mut hist = [0u64; 10];
            for &count in self.sequence_counts.values() {
                let idx = (count as usize).saturating_sub(1).min(9);
                hist[idx] += count; // Each sequence contributes 'count' reads
            }
            hist
        } else {
            self.duplication_histogram
        }
    }

    fn recalculate_histogram(&mut self) {
        self.duplication_histogram = [0u64; 10];
        for &count in self.sequence_counts.values() {
            let idx = (count as usize).saturating_sub(1).min(9);
            self.duplication_histogram[idx] += count;
        }
        self.histogram_dirty = false;
    }

    /// Get the percentage of reads in each duplication level.
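    ///
    /// A sketch (`ignore`: illustrative only); two copies of one sequence
    /// put 100% of sampled reads at level 2:
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::new();
    /// dup.update(b"ACGT", 1);
    /// dup.update(b"ACGT", 2);
    /// let pct = dup.duplication_percentages();
    /// assert!((pct[1] - 100.0).abs() < 0.001);
    /// ```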
    pub fn duplication_percentages(&mut self) -> [f64; 10] {
        if self.reads_processed == 0 {
            return [0.0; 10];
        }

        let total = self.reads_processed as f64;
        let hist = self.duplication_histogram();
        let mut percentages = [0.0f64; 10];

        for (i, &count) in hist.iter().enumerate() {
            percentages[i] = (count as f64 / total) * 100.0;
        }

        percentages
    }

    /// Merge statistics from another DuplicationStats instance.
    ///
    /// Counts combine correctly because identical sequences hash to the
    /// same value in every worker; the merged result is still an estimate,
    /// since workers sample independently and distinct sequences can
    /// collide on the same 64-bit hash.
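    ///
    /// A sketch of combining per-worker stats (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut total = DuplicationStats::new();
    /// total.update(b"ACGT", 1);
    ///
    /// let mut worker = DuplicationStats::new();
    /// worker.update(b"ACGT", 1);
    ///
    /// total.merge(&worker);
    /// assert_eq!(total.reads_processed(), 2);
    /// assert_eq!(total.unique_count(), 1); // identical hashes merge
    /// ```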
    pub fn merge(&mut self, other: &DuplicationStats) {
        for (&hash, &count) in &other.sequence_counts {
            *self.sequence_counts.entry(hash).or_insert(0) += count;
        }
        self.reads_processed += other.reads_processed;
        self.histogram_dirty = true;
    }

    /// Check if sampling is still active.
    pub fn is_sampling(&self) -> bool {
        self.sampling_active
    }

    /// Get the sample size limit.
    pub fn sample_size(&self) -> u64 {
        self.sample_size
    }

    /// Get sequences with highest duplication.
    ///
    /// Returns hashes and their counts, sorted by count descending.
    pub fn most_duplicated(&self, n: usize) -> Vec<(u64, u64)> {
        let mut sorted: Vec<(u64, u64)> = self
            .sequence_counts
            .iter()
            .map(|(&k, &v)| (k, v))
            .collect();

        sorted.sort_by(|a, b| b.1.cmp(&a.1));
        sorted.truncate(n);
        sorted
    }

    /// Get the mean duplication level.
    ///
    /// Returns the average number of times each unique sequence appears.
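    ///
    /// A sketch of the arithmetic (`ignore`: illustrative only):
    ///
    /// ```ignore
    /// let mut dup = DuplicationStats::new();
    /// dup.update(b"ACGT", 1);
    /// dup.update(b"ACGT", 2);
    /// dup.update(b"TTAA", 3);
    /// // 3 sampled reads over 2 unique sequences
    /// assert!((dup.mean_duplication() - 1.5).abs() < 0.001);
    /// ```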
    pub fn mean_duplication(&self) -> f64 {
        let unique = self.sequence_counts.len() as u64;
        if unique == 0 {
            0.0
        } else {
            self.reads_processed as f64 / unique as f64
        }
    }

    /// Create a DuplicationStats from raw hash counts.
    ///
    /// This is used for converting from FastQcStats to QcStats.
    pub fn from_raw(sequence_counts: FxHashMap<u64, u64>, sample_size: u64) -> Self {
        let reads_processed: u64 = sequence_counts.values().sum();
        Self {
            sequence_counts,
            sample_size,
            reads_processed,
            sampling_active: false,
            duplication_histogram: [0u64; 10],
            histogram_dirty: true,
            enabled: true,
        }
    }

    /// Check if duplication evaluation is enabled.
    pub fn is_enabled(&self) -> bool {
        self.enabled
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_duplication_stats_new() {
        let ds = DuplicationStats::new();
        assert_eq!(ds.reads_processed(), 0);
        assert_eq!(ds.unique_count(), 0);
        assert!(ds.is_sampling());
    }

    #[test]
    fn test_duplication_stats_update_unique() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"GCTA", 2);
        ds.update(b"TTAA", 3);

        assert_eq!(ds.reads_processed(), 3);
        assert_eq!(ds.unique_count(), 3);
        assert!((ds.duplication_rate() - 0.0).abs() < 0.001);
    }

    #[test]
    fn test_duplication_stats_update_duplicates() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"ATGC", 2);
        ds.update(b"ATGC", 3);

        assert_eq!(ds.reads_processed(), 3);
        assert_eq!(ds.unique_count(), 1);
        // 2 out of 3 are duplicates = 66.67%
        assert!((ds.duplication_rate() - 66.666).abs() < 0.1);
    }

    #[test]
    fn test_duplication_stats_case_insensitive() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"atgc", 2);

        assert_eq!(ds.unique_count(), 1);
        assert_eq!(ds.reads_processed(), 2);
    }

    #[test]
    fn test_duplication_stats_sampling_limit() {
        let mut ds = DuplicationStats::with_sample_size(5);

        for i in 1..=10 {
            ds.update(format!("SEQ{}", i).as_bytes(), i);
        }

        assert_eq!(ds.reads_processed(), 5);
        assert!(!ds.is_sampling());
    }

    #[test]
    fn test_duplication_stats_unique_rate() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"ATGC", 2);
        ds.update(b"GCTA", 3);
        ds.update(b"TTAA", 4);

        // 3 unique out of 4 = 75% unique
        assert!((ds.unique_rate() - 75.0).abs() < 0.1);
    }

    #[test]
    fn test_duplication_stats_histogram() {
        let mut ds = DuplicationStats::new();
        // 1 unique sequence
        ds.update(b"UNIQUE", 1);
        // 1 sequence seen twice
        ds.update(b"DOUBLE", 2);
        ds.update(b"DOUBLE", 3);
        // 1 sequence seen 3 times
        ds.update(b"TRIPLE", 4);
        ds.update(b"TRIPLE", 5);
        ds.update(b"TRIPLE", 6);

        let hist = ds.duplication_histogram();
        assert_eq!(hist[0], 1); // 1 read with duplication level 1
        assert_eq!(hist[1], 2); // 2 reads with duplication level 2
        assert_eq!(hist[2], 3); // 3 reads with duplication level 3
    }

    #[test]
    fn test_duplication_stats_merge() {
        let mut ds1 = DuplicationStats::new();
        ds1.update(b"ATGC", 1);

        let mut ds2 = DuplicationStats::new();
        ds2.update(b"ATGC", 1);
        ds2.update(b"GCTA", 2);

        ds1.merge(&ds2);

        assert_eq!(ds1.reads_processed(), 3);
        // ATGC appears in both, so 1 unique with count 2
        // GCTA appears once
        // 2 unique sequences, 3 reads -> 1 duplicate
        assert_eq!(ds1.unique_count(), 2);
    }

    #[test]
    fn test_duplication_stats_empty() {
        let mut ds = DuplicationStats::new();
        ds.update(b"", 1);

        assert_eq!(ds.reads_processed(), 0);
        assert_eq!(ds.duplication_rate(), 0.0);
    }

    #[test]
    fn test_duplication_stats_most_duplicated() {
        let mut ds = DuplicationStats::new();
        for i in 1..=5 {
            ds.update(b"COMMON", i);
        }
        for i in 1..=2 {
            ds.update(b"LESS", i + 5);
        }
        ds.update(b"RARE", 8);

        let top = ds.most_duplicated(2);
        assert_eq!(top.len(), 2);
        assert_eq!(top[0].1, 5); // COMMON with 5 counts
        assert_eq!(top[1].1, 2); // LESS with 2 counts
    }

    #[test]
    fn test_duplication_stats_mean_duplication() {
        let mut ds = DuplicationStats::new();
        // 2 sequences, each seen 2 times = 4 reads
        ds.update(b"SEQ1", 1);
        ds.update(b"SEQ1", 2);
        ds.update(b"SEQ2", 3);
        ds.update(b"SEQ2", 4);

        // Mean duplication = 4 reads / 2 unique = 2.0
        assert!((ds.mean_duplication() - 2.0).abs() < 0.001);
    }

    #[test]
    fn test_duplication_stats_serialize() {
        let mut ds = DuplicationStats::new();
        ds.update(b"ATGC", 1);
        ds.update(b"ATGC", 2);

        let json = serde_json::to_string(&ds).unwrap();
        let ds2: DuplicationStats = serde_json::from_str(&json).unwrap();

        assert_eq!(ds.reads_processed(), ds2.reads_processed());
        assert_eq!(ds.unique_count(), ds2.unique_count());
    }

    #[test]
    fn test_hash_sequence_consistency() {
        let hash1 = hash_sequence(b"ATGC");
        let hash2 = hash_sequence(b"ATGC");
        let hash3 = hash_sequence(b"atgc");

        assert_eq!(hash1, hash2);
        assert_eq!(hash1, hash3); // Case insensitive
    }

    #[test]
    fn test_duplication_percentages() {
        let mut ds = DuplicationStats::new();
        // 50 unique reads
        for i in 1..=50 {
            ds.update(format!("UNIQUE{}", i).as_bytes(), i);
        }
        // 50 reads that are duplicates (25 sequences x 2)
        for i in 1..=25 {
            ds.update(format!("DUP{}", i).as_bytes(), 50 + (i * 2) - 1);
            ds.update(format!("DUP{}", i).as_bytes(), 50 + (i * 2));
        }

        let percentages = ds.duplication_percentages();
        // 50% unique (index 0)
        assert!((percentages[0] - 50.0).abs() < 0.1);
        // 50% duplicated at level 2 (index 1)
        assert!((percentages[1] - 50.0).abs() < 0.1);
    }
}