1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
use std::convert::{TryFrom, TryInto};
use std::path::Path;

use tabfile::Tabfile;

use crate::cds::CDS;
use crate::error::{FileError, ParseError};
use crate::interval::Interval;

#[derive(Debug, PartialEq)]
pub struct SeqAnnotation {
    pub name: String,               // col 0
    pub chr: String,                // col 1
    pub range: Interval,            // col 2
    pub strand: Strand,             // col 3
    pub exons: Vec<Interval>,       // col 4
    pub coding_sequences: Vec<CDS>, // col 5+6
}

impl SeqAnnotation {
    pub fn new(
        name: String,
        chr: String,
        range: Interval,
        strand: Strand,
        exons: Vec<Interval>,
        coding_sequences: Vec<CDS>,
    ) -> Self {
        Self {
            name,
            chr,
            range,
            strand,
            exons,
            coding_sequences,
        }
    }

    /// Find introns based on the exons
    /// Assumption: Every intron is flanked by exons
    ///
    /// Cases:  a    b    c    d    e
    /// exons   |  --|--  |  --|--  |
    /// DNA  --------------------------
    ///
    /// a) Not an intron, because it is before the first exon
    /// b,d) Not an intron, because it is inside an exon
    /// c) An intron, because it is between two exons
    /// e) 3' UTR region. Not an intron.
    ///
    pub fn find_intron(&self, genomic_position: usize) -> Option<Interval> {
        let mut left_exon_end = 0;
        for (i, exon) in self.exons.iter().enumerate() {
            if genomic_position < exon.start {
                if i > 0 {
                    // case c
                    return Some(Interval::new(left_exon_end, exon.start).expect("start<stop"));
                } else {
                    // case a
                    return None;
                }
            } else if exon.contains(genomic_position) {
                return None; // cases b and d
            } // the position is right of the current exon
            left_exon_end = exon.stop;
        }
        None // case e
    }

    /// Determine all intron locations based on the exons
    ///
    /// Note that exons may overlap. But in any case, this function only returns
    /// regions that are not covered by any exons (assuming that the exons are
    /// sorted by start and then by stop position).
    pub fn get_introns(&self) -> Vec<Interval> {
        let mut result = Vec::new();
        let mut left_exon_end = 0;
        for (i, exon) in self.exons.iter().enumerate() {
            if i > 0 && left_exon_end < exon.start {
                // exons do not overlap. We can define a pure intron region
                result.push(
                    Interval::new(left_exon_end, exon.start).unwrap(), // pretty sure the exons are ordered
                )
            } // else: overlapping exons. We need to treat this as a super-exon region
            left_exon_end = exon.stop
        }
        result
    }

    pub fn get_end_minus50(&self) -> usize {
        let mut remain  = 50;
        match self.strand {
            Strand::Plus => {
                for cds in self.coding_sequences.iter().rev() {
                    //println!("plus {:?}", cds.range.stop - cds.range.start);
                    if cds.range.stop - cds.range.start > remain {
                        return cds.range.stop - remain;
                        //println!("end: {:?}", cds.range.stop - remain)
                    } else {
                        remain = remain - (cds.range.stop - cds.range.start)
                    }
                }
                self.range.start
            }
            Strand::Minus => {
                for cds in self.coding_sequences.iter() {
                    //println!("minus {:?}", cds.range.stop - cds.range.start);
                    if cds.range.stop - cds.range.start > remain {
                        return cds.range.start + remain;
                        //println!("end: {:?}", cds.range.start + remain)
                    } else {
                        remain = remain - (cds.range.stop - cds.range.start)
                    }
                }
                self.range.stop
            }
        }   
    }

    pub fn find_exon(&self, genomic_position: usize) -> Option<Interval> {
        for exon in &self.exons {
            if exon.contains(genomic_position) {
                return Some(*exon);
            }
        }
        None
    }

    pub fn find_cds(&self, genomic_position: usize) -> Option<CDS> {
        for cds in &self.coding_sequences {
            if cds.range.contains(genomic_position) {
                return Some(*cds);
            }
        }
        None
    }
}

#[derive(Debug, PartialEq, Eq)]
pub enum Strand {
    Plus,
    Minus,
}

impl std::fmt::Display for Strand {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}",
            match self {
                Self::Plus => "+",
                Self::Minus => "-",
            }
        )
    }
}

impl TryFrom<char> for Strand {
    type Error = ParseError;
    fn try_from(c: char) -> Result<Self, Self::Error> {
        match c {
            '+' => Ok(Strand::Plus),
            '-' => Ok(Strand::Minus),
            _ => Err(ParseError::somewhere("+ or -", c.to_string())),
        }
    }
}

pub fn read_sequence_annotations_from_file<P: AsRef<Path>>(
    path: P,
    filter_for_id: Option<&str>,
) -> Result<Vec<SeqAnnotation>, FileError> {
    const NAME_IDX: usize = 0;
    const CHR_IDX: usize = 1;
    const STRAND_IDX: usize = 2;
    const GENE_RANGE_IDX: usize = 3;
    const EXONS_IDX: usize = 4;
    const CDS_IDX: usize = 5;
    const CDS_PHASES_IDX: usize = 6;

    let mut result = Vec::new();
    let tabfile = match Tabfile::open(&path) {
        Ok(tf) => tf.comment_character('#'),
        Err(e) => return Err(FileError::io(Some(&path), e)),
    };
    for record_result in tabfile {
        let record = match record_result {
            Ok(record) => record,
            Err(e) => return Err(FileError::io(Some(&path), e)),
        };
        let tokens = record.fields();
        if tokens.len() < 7 {
            let err = ParseError::file(
                path.as_ref().to_path_buf(),
                record.line_number(),
                "7 columns",
                record.line().to_string(),
            );
            return Err(FileError::parse(Some(&path), err));
        }

        if let Some(id) = filter_for_id {
            if tokens[NAME_IDX] != id {
                continue;
            }
        } else {
            if tokens[NAME_IDX] == "gene_name" {
                continue //ignore header line
            }
        }

        let exons = Interval::parse_many(tokens[EXONS_IDX], ';')
            .map_err(|e| FileError::parse(Some(&path), e))?;
        let cds_intervals = Interval::parse_many(tokens[CDS_IDX], ';')
            .map_err(|e| FileError::parse(Some(&path), e))?;
        let cds_phases: Vec<&str> = tokens[CDS_PHASES_IDX]
            .split(';')
            .filter(|p| p != &"")
            .collect();
        if cds_intervals.len() != cds_phases.len() {
            return Err(FileError::parse(
                Some(&path),
                ParseError::file(
                    path.as_ref().to_path_buf(),
                    record.line_number(),
                    "number of CDS regions and CDS phases to be the same",
                    format!(
                        "#CDS_regions={}, #CDS_phases={}",
                        cds_intervals.len(),
                        cds_phases.len()
                    ),
                ),
            ));
        }
        let coding_sequences = {
            let mut cdss = Vec::new();
            for (interval, phase) in cds_intervals.iter().zip(cds_phases) {
                let phase = match phase.try_into() {
                    Ok(phase) => phase,
                    Err(e) => return Err(FileError::parse(Some(&path), e)),
                };
                cdss.push(CDS::new(*interval, phase));
            }
            cdss
        };
        let strand: char = tokens[STRAND_IDX].chars().next().unwrap();
        result.push(SeqAnnotation {
            name: tokens[NAME_IDX].to_string(),
            chr: tokens[CHR_IDX].to_string(),
            range: Interval::parse(tokens[GENE_RANGE_IDX])
                .map_err(|e| FileError::parse(Some(&path), e))?,
            strand: strand
                .try_into()
                .map_err(|e| FileError::parse(Some(&path), e))?,
            exons,
            coding_sequences,
        });
    }
    Ok(result)
}