1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
//! # illumina_coordinates
//!
//! This crate provides a single function to parse sequence identifiers from FASTQ files created
//! by Illumina sequencers. Sequence identifiers contain information about each read, including
//! the physical location of the DNA cluster on the flow cell surface that contained the
//! associated sequence.
//!
//! Illumina was not involved in the creation of this library in any way.

#![crate_type="lib"]
#![deny(warnings, missing_docs)]
use std::convert::From;
use std::result::Result;
use std::num;


/// The sample a read was assigned to.
///
/// Sample numbers are either the number from the sample sheet or a sequence if the read was from
/// the Undetermined Reads.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
pub enum Sample {
    /// Sample number
    Number(u8),
    /// Sequence from Undetermined Reads
    Sequence(String)
}

/// A parsed sequence identifier
///
/// `Debug` and `PartialEq` are derived so that parse results can be printed, compared in
/// assertions, and used with `Result::unwrap_err`.
#[derive(Debug, PartialEq)]
pub struct SequenceIdentifier {
    /// ID of the sequencing machine
    pub sequencer_id: String,
    /// The number of sequencing runs this machine has performed
    pub run_count: u16,
    /// ID of the flow cell, printed on the side of the glass slide
    pub flow_cell_id: String,
    /// Lane number. For MiSeqs, this is always 1
    pub lane: u8,
    /// The near or far side of the flow cell surface
    pub side: u8,
    /// The row within a lane, if wide enough. For MiSeqs, this is always 1
    pub swath: u8,
    /// The positional order of the region where the cluster is located
    pub tile: u8,
    /// The x-coordinate of the cluster
    pub x: u16,
    /// The y-coordinate of the cluster
    pub y: u16,
    /// The read number
    pub read: u8,
    /// Whether the read was filtered for low quality (Y=filtered)
    pub is_filtered: bool,
    /// Indicates the type of control, 0 = not a control read
    pub control_number: u8,
    /// Number from sample sheet, or the sequence if the read is in Undetermined Reads
    pub sample: Sample
}

/// Errors encountered when parsing FASTQ files
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IlluminaError {
    /// We expected an integer (or a `Y`/`N` filter flag) but did not find one
    ParseError,
    /// The line was not structured as expected
    SplitError
}

impl std::fmt::Display for IlluminaError {
    /// Writes a short, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match *self {
            IlluminaError::ParseError => "a field of the sequence identifier could not be parsed",
            IlluminaError::SplitError => "the sequence identifier is not structured as expected",
        };
        f.write_str(message)
    }
}

// Implementing the standard error trait lets callers box this type or propagate it with `?`
// into functions that return `Box<dyn std::error::Error>`.
impl std::error::Error for IlluminaError {}

// Conversion used by the `?` operator when an integer field fails to parse.
impl From<num::ParseIntError> for IlluminaError {
    /// Maps any integer parsing failure to `IlluminaError::ParseError`; the details carried by
    /// the original error are discarded.
    fn from(_parse_failure: num::ParseIntError) -> Self {
        IlluminaError::ParseError
    }
}

/// Parses location information from an Illumina sequence identifier. This implementation is
/// about 3x faster than using a regular expression.
///
/// The fields in the example identifier below have the following meaning:
/// @M03745:11:000000000-B54L5:1:2108:4127:8949 1:N:0:0
///
/// M03745              ID of the sequencing machine
///
/// 11                  run count for this machine
///
/// 000000000-B54L5     ID of the flow cell. "B54L5" will be printed on the flow cell in this example
///
/// 1                   lane number. For MiSeqs, there's only one lane
///
/// 2108                the first digit is the side of the chip
///                     the second digit is the swath. For MiSeqs, this is always 1. For HiSeqs, each lane is two tiles
///                     wide, and the first pass from left-to-right is swath one, then the returning pass on the other
///                     side of the lane is swath two
///                     the last two digits are the order of the tile. For MiSeqs, this is a number from 1 to 19
///
/// 4127                the x-position of the read in the tile, in arbitrary units
///
/// 8949                the y-position of the read in the tile, in arbitrary units
///
/// 1                   First (forward) read in a paired-end run
///
/// N                   Read was not filtered (sufficient quality)
///
/// 0                   This was not a control
///
/// 0                   This was the first sample on the sample sheet
///
/// See https://help.basespace.illumina.com/articles/descriptive/fastq-files/ for more information.
///
/// # Example
///
/// ```rust
/// extern crate illumina_coordinates;
/// use illumina_coordinates::Sample;
///
/// fn main() {
///     let line = "@M03745:11:000000000-B54L5:1:2108:4127:8949 1:N:0:0";
///     let seq_id = illumina_coordinates::parse_sequence_identifier(&line).unwrap();
///     assert_eq!(seq_id.sequencer_id, "M03745".to_string());
///     assert_eq!(seq_id.run_count, 11);
///     assert_eq!(seq_id.flow_cell_id, "000000000-B54L5".to_string());
///     assert_eq!(seq_id.lane, 1);
///     assert_eq!(seq_id.side, 2);
///     assert_eq!(seq_id.swath, 1);
///     assert_eq!(seq_id.tile, 8);
///     assert_eq!(seq_id.x, 4127);
///     assert_eq!(seq_id.y, 8949);
///     assert_eq!(seq_id.read, 1);
///     assert_eq!(seq_id.is_filtered, false);
///     assert_eq!(seq_id.control_number, 0);
///     assert_eq!(seq_id.sample, Sample::Number(0));
/// }
/// ```
pub fn parse_sequence_identifier(text: &str) -> Result<SequenceIdentifier, IlluminaError> {
    let halves: Vec<&str> = text.trim().split(' ').collect();
    if halves.len() != 2 {
        return Err(IlluminaError::SplitError)
    }
    let left: Vec<&str> = halves[0].split(':').collect();
    let right: Vec<&str> = halves[1].split(':').collect();
    if left.len() != 7 {
        return Err(IlluminaError::SplitError);
    }
    if right.len() != 4 {
        return Err(IlluminaError::SplitError);
    }
    let sequencer_id = left[0].split_at(1).1.to_string();
    let run_count = left[1].parse::<u16>()?;
    let flow_cell_id = left[2].to_string();
    let lane = left[3].parse::<u8>()?;
    let (side, remainder) = left[4].split_at(1);
    let (swath, tile) = remainder.split_at(1);
    let side = side.parse::<u8>()?;
    let swath = swath.parse::<u8>()?;
    let tile = tile.parse::<u8>()?;
    let x = left[5].parse::<u16>()?;
    let y = left[6].parse::<u16>()?;

    let read = right[0].parse::<u8>()?;
    let is_filtered = match right[1] {
        "Y" => true,
        "N" => false,
        _ => return Err(IlluminaError::ParseError)
    };
    let control_number= right[2].parse::<u8>()?;
    let sample = right[3].parse::<u8>();
    let sample = match sample {
        Ok(n) => Sample::Number(n),
        Err(_) => Sample::Sequence(String::from(right[3]))
    };

    Ok(SequenceIdentifier {
        sequencer_id,
        run_count,
        flow_cell_id,
        lane,
        side,
        swath,
        tile,
        x,
        y,
        read,
        is_filtered,
        control_number,
        sample
    })
}


#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts every field that is identical across the well-formed fixture lines used below
    /// (everything except the filter flag).
    fn assert_shared_fields(seq_id: &SequenceIdentifier) {
        assert_eq!(seq_id.sequencer_id, "M03745");
        assert_eq!(seq_id.run_count, 11);
        assert_eq!(seq_id.flow_cell_id, "000000000-B54L5");
        assert_eq!(seq_id.lane, 1);
        assert_eq!(seq_id.side, 2);
        assert_eq!(seq_id.swath, 1);
        assert_eq!(seq_id.tile, 8);
        assert_eq!(seq_id.x, 4127);
        assert_eq!(seq_id.y, 8949);
        assert_eq!(seq_id.read, 1);
        assert_eq!(seq_id.control_number, 0);
        assert_eq!(seq_id.sample, Sample::Number(0));
    }

    /// A well-formed identifier with the `N` filter flag parses into the expected fields.
    #[test]
    fn test_parse() {
        let parsed =
            parse_sequence_identifier("@M03745:11:000000000-B54L5:1:2108:4127:8949 1:N:0:0")
                .unwrap();
        assert_shared_fields(&parsed);
        assert!(!parsed.is_filtered);
    }

    /// A trailing newline is tolerated, and the `Y` filter flag is reported as filtered.
    #[test]
    fn test_parse_with_newline() {
        let parsed =
            parse_sequence_identifier("@M03745:11:000000000-B54L5:1:2108:4127:8949 1:Y:0:0\n")
                .unwrap();
        assert_shared_fields(&parsed);
        assert!(parsed.is_filtered);
    }

    /// A line that is not a sequence identifier (here, raw bases) is rejected.
    #[test]
    fn test_parse_error() {
        let outcome = parse_sequence_identifier("CACGACGACTAGCTACGGACGCGGCACGACGCAG");
        assert!(outcome.is_err());
    }
}