Skip to main content

cyanea_seq/
assembly.rs

1//! Assembly quality-control metrics.
2//!
3//! Compute standard assembly statistics (N50, L50, N90, GC content, auN)
4//! from a set of contig sequences.
5
6use cyanea_core::{CyaneaError, Result};
7
8/// Assembly quality statistics.
9#[derive(Debug, Clone)]
10#[allow(non_snake_case)]
11pub struct AssemblyStats {
12    /// Number of contigs.
13    pub n_contigs: usize,
14    /// Total assembly length in bases.
15    pub total_length: usize,
16    /// Length of the largest contig.
17    pub largest_contig: usize,
18    /// Length of the smallest contig.
19    pub smallest_contig: usize,
20    /// GC content as a fraction (0.0–1.0), ignoring N bases.
21    pub gc_content: f64,
22    /// N50: the length such that contigs of this length or longer cover ≥ 50% of the assembly.
23    pub n50: usize,
24    /// L50: the number of contigs needed to reach N50 cumulative length.
25    pub l50: usize,
26    /// N90: the length such that contigs of this length or longer cover ≥ 90% of the assembly.
27    pub n90: usize,
28    /// L90: the number of contigs needed to reach N90 cumulative length.
29    pub l90: usize,
30    /// Area under the Nx curve (auN = Σ length² / total_length).
31    #[allow(non_snake_case)]
32    pub auN: f64,
33}
34
35/// Compute assembly statistics for a set of contigs.
36///
37/// # Errors
38///
39/// Returns an error if `contigs` is empty.
40pub fn assembly_stats(contigs: &[&[u8]]) -> Result<AssemblyStats> {
41    if contigs.is_empty() {
42        return Err(CyaneaError::InvalidInput(
43            "at least one contig is required".into(),
44        ));
45    }
46
47    let mut lengths: Vec<usize> = contigs.iter().map(|c| c.len()).collect();
48    lengths.sort_unstable_by(|a, b| b.cmp(a)); // descending
49
50    let total_length: usize = lengths.iter().sum();
51    let largest_contig = lengths[0];
52    let smallest_contig = *lengths.last().unwrap();
53    let n_contigs = lengths.len();
54
55    // GC content: count G+C over all non-N bases.
56    let mut gc = 0usize;
57    let mut non_n = 0usize;
58    for contig in contigs {
59        for &b in *contig {
60            let upper = b.to_ascii_uppercase();
61            match upper {
62                b'G' | b'C' => {
63                    gc += 1;
64                    non_n += 1;
65                }
66                b'A' | b'T' => {
67                    non_n += 1;
68                }
69                _ => {} // N and others ignored
70            }
71        }
72    }
73    let gc_content = if non_n > 0 {
74        gc as f64 / non_n as f64
75    } else {
76        0.0
77    };
78
79    let (n50, l50) = nx_from_sorted(&lengths, total_length, 0.5);
80    let (n90, l90) = nx_from_sorted(&lengths, total_length, 0.9);
81
82    // auN = Σ length² / total_length
83    #[allow(non_snake_case)]
84    let auN: f64 = lengths.iter().map(|&l| (l as f64).powi(2)).sum::<f64>() / total_length as f64;
85
86    Ok(AssemblyStats {
87        n_contigs,
88        total_length,
89        largest_contig,
90        smallest_contig,
91        gc_content,
92        n50,
93        l50,
94        n90,
95        l90,
96        auN,
97    })
98}
99
100/// Compute Nx and Lx values for a given fraction `x` (0.0–1.0).
101///
102/// Nx is the contig length at which cumulative length first reaches x × total.
103/// Lx is the number of contigs required to reach that threshold.
104///
105/// # Errors
106///
107/// Returns an error if `contigs` is empty or `x` is not in (0, 1].
108pub fn nx_values(contigs: &[&[u8]], x: f64) -> Result<(usize, usize)> {
109    if contigs.is_empty() {
110        return Err(CyaneaError::InvalidInput(
111            "at least one contig is required".into(),
112        ));
113    }
114    if x <= 0.0 || x > 1.0 {
115        return Err(CyaneaError::InvalidInput(format!(
116            "x must be in (0, 1], got {}",
117            x
118        )));
119    }
120
121    let mut lengths: Vec<usize> = contigs.iter().map(|c| c.len()).collect();
122    lengths.sort_unstable_by(|a, b| b.cmp(a));
123    let total: usize = lengths.iter().sum();
124
125    Ok(nx_from_sorted(&lengths, total, x))
126}
127
128/// Internal: compute Nx/Lx from pre-sorted (descending) lengths.
129fn nx_from_sorted(lengths: &[usize], total: usize, x: f64) -> (usize, usize) {
130    let threshold = (total as f64 * x).ceil() as usize;
131    let mut cumulative = 0usize;
132    for (i, &len) in lengths.iter().enumerate() {
133        cumulative += len;
134        if cumulative >= threshold {
135            return (len, i + 1);
136        }
137    }
138    // Should not reach here if lengths sum to total.
139    (*lengths.last().unwrap(), lengths.len())
140}
141
142#[cfg(test)]
143mod tests {
144    use super::*;
145
146    #[test]
147    fn assembly_stats_simple() {
148        let contigs: Vec<&[u8]> = vec![b"ACGTACGT", b"ACGT", b"AC"];
149        let stats = assembly_stats(&contigs).unwrap();
150        assert_eq!(stats.n_contigs, 3);
151        assert_eq!(stats.total_length, 14);
152        assert_eq!(stats.largest_contig, 8);
153        assert_eq!(stats.smallest_contig, 2);
154    }
155
156    #[test]
157    fn n50_known_values() {
158        // Contigs: 10, 8, 6, 4, 2 → total 30, threshold = 15
159        // cumulative: 10 → 18 ≥ 15, so N50=8, wait let's re-check:
160        // sorted desc: 10, 8, 6, 4, 2. cumulative after 10: 10 < 15. after 10+8=18 ≥ 15. N50=8, L50=2.
161        let c1 = vec![b'A'; 10];
162        let c2 = vec![b'A'; 8];
163        let c3 = vec![b'A'; 6];
164        let c4 = vec![b'A'; 4];
165        let c5 = vec![b'A'; 2];
166        let contigs: Vec<&[u8]> = vec![&c1, &c2, &c3, &c4, &c5];
167        let (n50, l50) = nx_values(&contigs, 0.5).unwrap();
168        assert_eq!(n50, 8);
169        assert_eq!(l50, 2);
170    }
171
172    #[test]
173    fn gc_content_correct() {
174        let contigs: Vec<&[u8]> = vec![b"GGCC", b"AATT"];
175        let stats = assembly_stats(&contigs).unwrap();
176        assert!((stats.gc_content - 0.5).abs() < 1e-10);
177    }
178
179    #[test]
180    fn empty_contigs_error() {
181        let contigs: Vec<&[u8]> = vec![];
182        assert!(assembly_stats(&contigs).is_err());
183    }
184}