atglib/refgene/
writer.rs

1use std::fmt::Write as FmtWrite;
2use std::fs::File;
3use std::io::{BufWriter, Write};
4use std::path::Path;
5
6use crate::models::{Transcript, TranscriptWrite};
7use crate::refgene::constants::*;
8use crate::utils::errors::ParseRefGeneError;
9use crate::utils::errors::ReadWriteError;
10
/// Buffered writer that outputs [`Transcript`]s
///
/// The wrapped output object is held inside a [`BufWriter`], so written
/// data may stay in the buffer until it is flushed.
///
/// # Examples
///
/// ```rust
/// use std::io;
/// use atglib::tests;
/// use atglib::refgene::Writer;
/// use atglib::models::TranscriptWrite;
///
/// let transcripts = vec![tests::transcripts::standard_transcript()];
///
/// let output = Vec::new(); // substitute this with proper IO (io::stdout())
/// let mut writer = Writer::new(output);
/// writer.write_transcript_vec(&transcripts);
///
/// let written_output = String::from_utf8(writer.into_inner().unwrap()).unwrap();
/// assert!(written_output.starts_with("0\tTest-Transcript\tchr1\t"));
/// ```
pub struct Writer<W>
where
    W: std::io::Write,
{
    // Buffered handle to the output; all writes go through it
    inner: BufWriter<W>,
}
33
34impl Writer<File> {
35    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ReadWriteError> {
36        match File::create(path.as_ref()) {
37            Ok(file) => Ok(Self::new(file)),
38            Err(err) => Err(ReadWriteError::new(err)),
39        }
40    }
41}
42
43impl<W: std::io::Write> Writer<W> {
44    /// Creates a new generic Writer for any `std::io::Read`` object
45    ///
46    /// Use this method when you want to write to stdout or
47    /// a remote source, e.g. via HTTP
48    pub fn new(writer: W) -> Self {
49        Writer {
50            inner: BufWriter::new(writer),
51        }
52    }
53
54    pub fn with_capacity(capacity: usize, writer: W) -> Self {
55        Writer {
56            inner: BufWriter::with_capacity(capacity, writer),
57        }
58    }
59
60    pub fn flush(&mut self) -> Result<(), ParseRefGeneError> {
61        match self.inner.flush() {
62            Ok(res) => Ok(res),
63            Err(err) => Err(ParseRefGeneError::from(err.to_string())),
64        }
65    }
66
67    pub fn into_inner(self) -> Result<W, ParseRefGeneError> {
68        match self.inner.into_inner() {
69            Ok(res) => Ok(res),
70            Err(err) => Err(ParseRefGeneError::from(err.to_string())),
71        }
72    }
73}
74
75impl<W: std::io::Write> TranscriptWrite for Writer<W> {
76    /// Writes a single transcript formatted as RefGene with an extra newline
77    ///
78    /// This method adds an extra newline at the end of the row
79    /// to allow writing multiple transcripts continuosly
80    fn writeln_single_transcript(&mut self, transcript: &Transcript) -> Result<(), std::io::Error> {
81        self.write_single_transcript(transcript)?;
82        self.inner.write_all("\n".as_bytes())
83    }
84
85    /// Writes a single transcript formatted as RefGene
86    ///
87    /// Consider [`writeln_single_transcript`](Writer::writeln_single_transcript)
88    /// to ensure that an extra newline is added to the output
89    fn write_single_transcript(&mut self, transcript: &Transcript) -> Result<(), std::io::Error> {
90        let columns: Vec<String> = Vec::from(transcript);
91        self.inner.write_all((columns.join("\t")).as_bytes())
92    }
93}
94
95impl From<&Transcript> for Vec<String> {
96    fn from(transcript: &Transcript) -> Self {
97        // RefGene is a tab-delimited format, each line has 16 columns
98        // Defines a Vector that contains the strings for each column
99        let mut columns: Vec<String> = vec!["".to_string(); N_REFGENE_COLUMNS];
100
101        columns[TRANSCRIPT_COL] = transcript.name().to_string();
102        columns[CHROMOSOME_COL] = transcript.chrom().to_string();
103        columns[STRAND_COL] = transcript.strand().to_string();
104        // RefGene handled start coordinates differently, so we substract 1.
105        // See the comments in `instantiate_exons`.
106        columns[TX_START_COL] = (transcript.tx_start() - 1).to_string();
107        columns[TX_END_COL] = transcript.tx_end().to_string();
108        columns[GENE_SYMBOL_COL] = transcript.gene().to_string();
109        columns[CDS_START_STAT_COL] = transcript.cds_start_stat().to_string();
110        columns[CDS_END_STAT_COL] = transcript.cds_end_stat().to_string();
111        columns[EXON_COUNT_COL] = transcript.exon_count().to_string();
112
113        // The bin value is not always present, defaulting to 0
114        columns[BIN_COL] = match transcript.bin() {
115            Some(x) => x.to_string(),
116            _ => "0".to_string(),
117        };
118
119        // The score value is not always present, default to 0
120        columns[SCORE_COL] = match transcript.score() {
121            Some(x) => x.to_string(),
122            _ => "0".to_string(),
123        };
124
125        // If the transcript does not have a CDS
126        // the CDS-start and CDS-end values are set to txEnd
127        columns[CDS_START_COL] = match transcript.cds_start() {
128            // RefGene handled start coordinates differently, so we substract 1.
129            // See the comments in `instantiate_exons`.
130            Some(x) => (x - 1).to_string(),
131            _ => transcript.tx_end().to_string(),
132        };
133        columns[CDS_END_COL] = match transcript.cds_end() {
134            Some(x) => x.to_string(),
135            _ => transcript.tx_end().to_string(),
136        };
137
138        // Don't ask, but some reason the refGene specs indicate that
139        // there is also a trailing comma after the list of exons
140        // e.g.: `12227,12721,14409,`
141
142        columns[EXON_STARTS_COL] = transcript.exons().iter().fold(
143            // Assuming that most genomic positions are 8-9 digits, plus the comma
144            String::with_capacity(transcript.exon_count() * 10),
145            |mut output, exon| {
146                // RefGene handled start coordinates differently, so we substract 1.
147                // See the comments in `instantiate_exons`.
148                let _ = write!(output, "{},", exon.start() - 1);
149                output
150            },
151        );
152
153        columns[EXON_ENDS_COL] = transcript.exons().iter().fold(
154            // Assuming that most genomic positions are 8-9 digits, plus the comma
155            String::with_capacity(transcript.exon_count() * 10),
156            |mut output, exon| {
157                let _ = write!(output, "{},", exon.end());
158                output
159            },
160        );
161
162        columns[EXON_FRAMES_COL] = transcript.exons().iter().fold(
163            // Assuming that most genomic positions are 8-9 digits, plus the comma
164            String::with_capacity(transcript.exon_count() * 10),
165            |mut output, exon| {
166                let _ = write!(output, "{},", exon.frame_offset().to_refgene());
167                output
168            },
169        );
170
171        columns
172    }
173}
174
#[cfg(test)]
mod tests {
    use super::Writer;
    use crate::models::{Transcript, TranscriptRead, TranscriptWrite};
    use crate::refgene::Reader;
    use crate::tests::transcripts;

    /// Writes `transcript` as RefGene, reads it back and compares the result
    ///
    /// Comparing the raw RefGene output directly is a bit too complicated,
    /// so the written bytes are re-parsed into a Transcript and the entry
    /// found under `name` is compared with the input.
    fn assert_roundtrip(transcript: Transcript, name: &str) {
        let transcripts = vec![transcript];
        let mut writer = Writer::new(Vec::new());
        let _ = writer.write_transcript_vec(&transcripts);

        let output = writer.into_inner().unwrap();

        assert!(output.len() > 10);

        let mut reader = Reader::new(&*output);
        let read_transcripts = reader.transcripts().unwrap();

        assert_eq!(read_transcripts.by_name(name)[0], &transcripts[0]);
    }

    #[test]
    fn test_nm_001365057() {
        assert_roundtrip(transcripts::nm_001365057(), "NM_001365057.2");
    }

    #[test]
    fn test_nm_001365408() {
        assert_roundtrip(transcripts::nm_001365408(), "NM_001365408.1");
    }

    #[test]
    fn test_nm_001371720() {
        assert_roundtrip(transcripts::nm_001371720(false), "NM_001371720.1");
    }

    #[test]
    fn test_nm_201550() {
        assert_roundtrip(transcripts::nm_201550(), "NM_201550.4");
    }
}