gecco 0.5.3

Gene Cluster prediction with Conditional Random Fields
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
//! Data layer types for gene cluster detection.

use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::fmt;

use serde::{Deserialize, Serialize};

use crate::interpro::GOTerm;

// ---------------------------------------------------------------------------
// ClusterType
// ---------------------------------------------------------------------------

/// An immutable storage for the type of a gene cluster.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ClusterType {
    pub names: BTreeSet<String>,
}

impl ClusterType {
    pub fn new(names: impl IntoIterator<Item = impl Into<String>>) -> Self {
        Self {
            names: names.into_iter().map(Into::into).collect(),
        }
    }

    pub fn unknown() -> Self {
        Self {
            names: BTreeSet::new(),
        }
    }

    pub fn is_unknown(&self) -> bool {
        self.names.is_empty()
    }

    /// Unpack a composite type into individual single types.
    pub fn unpack(&self) -> Vec<ClusterType> {
        self.names
            .iter()
            .map(|n| ClusterType::new(std::iter::once(n.clone())))
            .collect()
    }
}

impl fmt::Display for ClusterType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.names.is_empty() {
            write!(f, "Unknown")
        } else {
            let mut first = true;
            for name in &self.names {
                if !first {
                    write!(f, ";")?;
                }
                write!(f, "{}", name)?;
                first = false;
            }
            Ok(())
        }
    }
}

// ---------------------------------------------------------------------------
// Strand
// ---------------------------------------------------------------------------

/// DNA strand on which a gene is located.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Strand {
    Coding,
    Reverse,
}

impl Strand {
    pub fn sign(&self) -> &'static str {
        match self {
            Strand::Coding => "+",
            Strand::Reverse => "-",
        }
    }

    pub fn from_sign(s: &str) -> Option<Self> {
        match s {
            "+" => Some(Strand::Coding),
            "-" => Some(Strand::Reverse),
            _ => None,
        }
    }

    pub fn as_int(&self) -> i8 {
        match self {
            Strand::Coding => 1,
            Strand::Reverse => -1,
        }
    }
}

// ---------------------------------------------------------------------------
// Domain
// ---------------------------------------------------------------------------

/// A conserved region within a protein.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Domain {
    pub name: String,
    /// Start coordinate within protein (1-based, inclusive).
    pub start: i64,
    /// End coordinate within protein (inclusive).
    pub end: i64,
    /// HMM library name (e.g. "Pfam").
    pub hmm: String,
    /// Independent e-value from hmmsearch.
    pub i_evalue: f64,
    /// P-value from hmmsearch.
    pub pvalue: f64,
    /// Probability of being part of a gene cluster (None if not yet predicted).
    pub probability: Option<f64>,
    /// Cluster weight from CRF state features.
    pub cluster_weight: Option<f64>,
    pub go_terms: Vec<GOTerm>,
    pub go_functions: Vec<GOTerm>,
    pub qualifiers: BTreeMap<String, Vec<String>>,
}

impl Domain {
    pub fn with_probability(&self, probability: Option<f64>) -> Domain {
        let mut d = self.clone();
        d.probability = probability;
        d
    }

    pub fn with_cluster_weight(&self, cluster_weight: Option<f64>) -> Domain {
        let mut d = self.clone();
        d.cluster_weight = cluster_weight;
        d
    }
}

// ---------------------------------------------------------------------------
// Protein
// ---------------------------------------------------------------------------

/// A sequence of amino-acids translated from a gene.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Protein {
    pub id: String,
    pub seq: String,
    pub domains: Vec<Domain>,
}

impl Protein {
    pub fn new(id: impl Into<String>, seq: impl Into<String>) -> Self {
        Self {
            id: id.into(),
            seq: seq.into(),
            domains: Vec::new(),
        }
    }

    pub fn with_seq(&self, seq: String) -> Protein {
        Protein {
            id: self.id.clone(),
            seq,
            domains: self.domains.clone(),
        }
    }

    pub fn with_domains(&self, domains: Vec<Domain>) -> Protein {
        Protein {
            id: self.id.clone(),
            seq: self.seq.clone(),
            domains,
        }
    }
}

// ---------------------------------------------------------------------------
// Gene
// ---------------------------------------------------------------------------

/// A nucleotide sequence coding a protein.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Gene {
    /// Source sequence identifier.
    pub source_id: String,
    /// Start coordinate within source (1-based, inclusive).
    pub start: i64,
    /// End coordinate within source (inclusive).
    pub end: i64,
    pub strand: Strand,
    pub protein: Protein,
    pub qualifiers: BTreeMap<String, Vec<String>>,
    /// Override probability (takes precedence over domain-derived probability).
    pub probability: Option<f64>,
}

impl Gene {
    pub fn id(&self) -> &str {
        &self.protein.id
    }

    /// Average of domain probabilities (or override if set).
    pub fn average_probability(&self) -> Option<f64> {
        if let Some(p) = self.probability {
            return Some(p);
        }
        let probas: Vec<f64> = self
            .protein
            .domains
            .iter()
            .filter_map(|d| d.probability)
            .collect();
        if probas.is_empty() {
            None
        } else {
            Some(statistics_mean(&probas))
        }
    }

    /// Highest domain probability (or override if set).
    pub fn maximum_probability(&self) -> Option<f64> {
        if let Some(p) = self.probability {
            return Some(p);
        }
        self.protein
            .domains
            .iter()
            .filter_map(|d| d.probability)
            .fold(None, |acc, p| Some(acc.map_or(p, |a: f64| a.max(p))))
    }

    /// Gene functions from GO term annotations.
    pub fn functions(&self) -> HashSet<String> {
        let mut fns: HashSet<String> = self
            .protein
            .domains
            .iter()
            .flat_map(|d| d.go_functions.iter().map(|t| t.name.clone()))
            .collect();
        if fns.is_empty() {
            fns.insert("unknown".to_string());
        }
        fns
    }

    pub fn with_protein(&self, protein: Protein) -> Gene {
        Gene {
            source_id: self.source_id.clone(),
            start: self.start,
            end: self.end,
            strand: self.strand,
            protein,
            qualifiers: self.qualifiers.clone(),
            probability: self.probability,
        }
    }

    pub fn with_probability(&self, probability: f64) -> Gene {
        let new_domains: Vec<Domain> = self
            .protein
            .domains
            .iter()
            .map(|d| d.with_probability(Some(probability)))
            .collect();
        Gene {
            source_id: self.source_id.clone(),
            start: self.start,
            end: self.end,
            strand: self.strand,
            protein: self.protein.with_domains(new_domains),
            qualifiers: self.qualifiers.clone(),
            probability: Some(probability),
        }
    }
}

fn statistics_mean(values: &[f64]) -> f64 {
    let mut partials = Vec::<f64>::new();
    for &value in values {
        let mut x = value;
        let mut next = Vec::with_capacity(partials.len() + 1);
        for &partial in &partials {
            let (hi, lo) = match x.abs().partial_cmp(&partial.abs()) {
                Some(Ordering::Less) => (partial, x),
                _ => (x, partial),
            };
            x = hi + lo;
            let yr = x - hi;
            let lo = lo - yr;
            if lo != 0.0 {
                next.push(lo);
            }
        }
        next.push(x);
        partials = next;
    }
    partials.iter().sum::<f64>() / values.len() as f64
}

// ---------------------------------------------------------------------------
// Cluster
// ---------------------------------------------------------------------------

/// A sequence of contiguous genes forming a putative BGC.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Cluster {
    pub id: String,
    pub genes: Vec<Gene>,
    pub cluster_type: Option<ClusterType>,
    pub type_probabilities: BTreeMap<String, f64>,
}

impl Cluster {
    pub fn new(id: impl Into<String>, genes: Vec<Gene>) -> Self {
        Self {
            id: id.into(),
            genes,
            cluster_type: None,
            type_probabilities: BTreeMap::new(),
        }
    }

    pub fn source_id(&self) -> &str {
        &self.genes[0].source_id
    }

    pub fn start(&self) -> i64 {
        self.genes.iter().map(|g| g.start).min().unwrap_or(0)
    }

    pub fn end(&self) -> i64 {
        self.genes.iter().map(|g| g.end).max().unwrap_or(0)
    }

    pub fn average_probability(&self) -> Option<f64> {
        let probas: Vec<f64> = self
            .genes
            .iter()
            .filter_map(|g| g.average_probability())
            .collect();
        if probas.is_empty() {
            None
        } else {
            Some(statistics_mean(&probas))
        }
    }

    pub fn maximum_probability(&self) -> Option<f64> {
        self.genes
            .iter()
            .filter_map(|g| g.maximum_probability())
            .fold(None, |acc, p| Some(acc.map_or(p, |a: f64| a.max(p))))
    }

    /// Compute weighted domain composition vector.
    pub fn domain_composition(
        &self,
        all_possible: Option<&[String]>,
        normalize: bool,
        minlog_weights: bool,
        use_pvalue: bool,
    ) -> Vec<f64> {
        // Collect all domains from cluster genes
        let domains: Vec<&Domain> = self
            .genes
            .iter()
            .flat_map(|g| g.protein.domains.iter())
            .collect();

        let names: Vec<&str> = domains.iter().map(|d| d.name.as_str()).collect();
        let weights: Vec<f64> = domains
            .iter()
            .map(|d| {
                let v = if use_pvalue { d.pvalue } else { d.i_evalue };
                if minlog_weights {
                    -v.log10()
                } else {
                    1.0 - v
                }
            })
            .collect();

        let unique_names: HashSet<&str> = names.iter().copied().collect();

        // Determine the set of all possible domain names
        let default_possible: Vec<String>;
        let all_possible_ref: &[String] = match all_possible {
            Some(ap) => ap,
            None => {
                let mut u: Vec<String> = unique_names.iter().map(|s| s.to_string()).collect();
                u.sort();
                u.dedup();
                default_possible = u;
                &default_possible
            }
        };

        let mut composition = vec![0.0f64; all_possible_ref.len()];
        for (i, dom) in all_possible_ref.iter().enumerate() {
            if unique_names.contains(dom.as_str()) {
                for (j, name) in names.iter().enumerate() {
                    if *name == dom.as_str() {
                        composition[i] += weights[j];
                    }
                }
            }
        }

        if normalize {
            let total: f64 = composition.iter().sum();
            if total > 0.0 {
                for v in &mut composition {
                    *v /= total;
                }
            }
        }

        composition
    }
}