lightmotif_io/transfac/
mod.rs

1//! Parser implementation for the TRANSFAC format.
2//!
3//! The TRANSFAC matrix format is similar to the EMBL sequence format,
4//! using a 2-letter header before each row that is used for metadata.
5//! The matrix usually contains counts, but they may be in floating-point
6//! format if they were rescaled.
7//!
8//! ```text
9//! AC  M00005
10//! XX
11//! DT  19.10.1992 (created); ewi.
12//! CO  Copyright (C), Biobase GmbH.
13//! XX
14//! P0      A      C      G      T
15//! 01      3      0      0      2      W
16//! 02      1      1      3      0      G
17//! 03      3      1      1      0      A
18//! 04      2      1      2      0      R
19//! 05      1      2      0      2      Y
20//! 06      0      5      0      0      C
21//! 07      5      0      0      0      A
22//! XX
23//! ```
24//!
25//! The parser implemented in this module is not complete, and only supports
26//! the following metadata:
27//!
28//! - `ID`: identifier
29//! - `AC`: accession
30//! - `NA`: name
31//! - `DE`: description
32//! - `DT`: date (creation or update)
33//! - `RE`: references (similar to EMBL in format)
34//! - `BS`: binding sites
35//! - `P0`: matrix.
36//!
37
38use std::io::BufRead;
39
40use lightmotif::abc::Alphabet;
41use lightmotif::abc::Pseudocounts;
42use lightmotif::dense::DenseMatrix;
43use lightmotif::pwm::CountMatrix;
44use lightmotif::pwm::FrequencyMatrix;
45
46mod parse;
47mod reader;
48
49pub use self::reader::Reader;
50
/// A TRANSFAC record.
///
/// The type parameter `A` is the alphabet of the matrix; the number of
/// matrix columns is taken from `A::K`.
#[derive(Debug, Clone)]
pub struct Record<A: Alphabet> {
    /// The identifier of the record, if any.
    id: Option<String>,
    /// The accession of the record, if any.
    accession: Option<String>,
    /// The name of the record, if any.
    name: Option<String>,
    /// The description of the record, if any.
    description: Option<String>,
    /// The raw matrix data, if any, stored as floating-point values.
    data: Option<DenseMatrix<f32, A::K>>,
    /// The dates attached to the record.
    // NOTE(review): presumably filled from `DT` lines by the reader -- confirm.
    dates: Vec<Date>,
    /// The references associated with the record.
    references: Vec<Reference>,
    /// Site strings attached to the record.
    // NOTE(review): presumably the `BS` (binding sites) lines -- confirm against the reader.
    sites: Vec<String>,
}
63
64impl<A: Alphabet> Record<A> {
65    /// The identifier of the record, if any.
66    pub fn id(&self) -> Option<&str> {
67        self.id.as_deref()
68    }
69
70    /// The accession of the record, if any.
71    pub fn accession(&self) -> Option<&str> {
72        self.accession.as_deref()
73    }
74
75    /// The name of the record, if any.
76    pub fn name(&self) -> Option<&str> {
77        self.name.as_deref()
78    }
79
80    /// The description of the record, if any.
81    pub fn description(&self) -> Option<&str> {
82        self.description.as_deref()
83    }
84
85    /// The raw data found in the matrix.
86    pub fn data(&self) -> Option<&DenseMatrix<f32, A::K>> {
87        self.data.as_ref()
88    }
89
90    /// The references associated with the record.
91    pub fn references(&self) -> &[Reference] {
92        &self.references
93    }
94
95    /// Get the record matrix as an integer count data.
96    pub fn to_counts(&self) -> Option<CountMatrix<A>> {
97        if let Some(data) = &self.data {
98            let mut counts = DenseMatrix::<u32, A::K>::new(data.rows());
99            for (i, row) in data.iter().enumerate() {
100                for (j, &x) in row.iter().enumerate() {
101                    // check the matrix contains count data
102                    if x.round() != x {
103                        return None;
104                    }
105                    counts[i][j] = x.round() as u32
106                }
107            }
108            CountMatrix::new(counts).ok()
109        } else {
110            None
111        }
112    }
113
114    /// Get the record matrix as a frequency matrix.
115    pub fn to_freq<P>(&self, pseudo: P) -> Option<FrequencyMatrix<A>>
116    where
117        P: Into<Pseudocounts<A>>,
118    {
119        if let Some(data) = &self.data {
120            let p = pseudo.into();
121            let mut probas = DenseMatrix::<f32, A::K>::new(data.rows());
122            for (i, row) in data.iter().enumerate() {
123                let src = &data[i];
124                let dst = &mut probas[i];
125                for (j, &x) in row.iter().enumerate() {
126                    dst[j] = x + p.counts()[j];
127                }
128                let s: f32 = dst.iter().sum();
129                for x in dst.iter_mut() {
130                    *x /= s;
131                }
132            }
133            FrequencyMatrix::new(probas).ok()
134        } else {
135            None
136        }
137    }
138}
139
/// The kind of event a [`Date`] refers to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DateKind {
    /// The date marks a creation.
    Created,
    /// The date marks an update.
    Updated,
}
145
/// A date entry, made of a kind, an author and a calendar date.
// NOTE(review): presumably built from `DT` lines such as
// `DT  19.10.1992 (created); ewi.` -- confirm against the parser.
#[derive(Debug, Clone)]
pub struct Date {
    /// Whether the date marks a creation or an update.
    kind: DateKind,
    /// The author recorded alongside the date.
    author: String,
    /// The day component of the date.
    day: u8,
    /// The month component of the date.
    month: u8,
    /// The year component of the date.
    year: u16,
}
154
/// The number of a reference, optionally paired with a cross-reference.
#[derive(Clone, Debug)]
pub struct ReferenceNumber {
    /// The local number of the reference.
    local: u32,
    /// The cross-reference, if any.
    xref: Option<String>,
}

impl ReferenceNumber {
    /// Create a new reference number with the given number.
    ///
    /// The resulting reference number has no cross-reference.
    pub fn new(local: u32) -> Self {
        Self { local, xref: None }
    }

    /// Create a new reference number with the given number and cross-reference.
    ///
    /// `xref` accepts anything convertible into an `Option<String>`, so both
    /// a plain `String` and an explicit `Option<String>` can be passed.
    pub fn with_xref<X>(local: u32, xref: X) -> Self
    where
        X: Into<Option<String>>,
    {
        Self {
            local,
            xref: xref.into(),
        }
    }

    /// The local number of the reference number.
    pub fn local(&self) -> u32 {
        self.local
    }

    /// The cross-reference, if any.
    pub fn xref(&self) -> Option<&str> {
        self.xref.as_deref()
    }
}
188
/// A reference attached to a record.
#[derive(Clone, Debug)]
pub struct Reference {
    /// The number of the reference.
    number: ReferenceNumber,
    // NOTE(review): authors are not stored; the field is left commented out.
    // authors: String,
    /// The title of the reference, if any.
    title: Option<String>,
    /// A link to the reference, if any.
    link: Option<String>,
    /// The PubMed ID of the reference, if any.
    pmid: Option<String>,
}
197
198impl Reference {
199    /// Create a new reference with the given reference number.
200    pub fn new(number: ReferenceNumber) -> Self {
201        Self {
202            number,
203            title: None,
204            link: None,
205            pmid: None,
206        }
207    }
208
209    /// The number of the reference.
210    pub fn number(&self) -> &ReferenceNumber {
211        &self.number
212    }
213
214    /// The title of the reference, if any.
215    pub fn title(&self) -> Option<&str> {
216        self.title.as_ref().map(String::as_str)
217    }
218
219    /// A link to the reference, if any.
220    pub fn link(&self) -> Option<&str> {
221        self.link.as_ref().map(String::as_str)
222    }
223
224    /// The PubMed ID of the reference, if any.
225    pub fn pmid(&self) -> Option<&str> {
226        self.pmid.as_ref().map(String::as_str)
227    }
228}
229
230pub fn read<B: BufRead, A: Alphabet>(reader: B) -> self::reader::Reader<B, A> {
231    self::reader::Reader::new(reader)
232}