// illumina_coordinates/lib.rs
//! # illumina_coordinates
//!
//! This crate provides a single function to parse sequence identifiers from FASTQ files created
//! by Illumina sequencers. Sequence identifiers contain information about each read, including
//! the physical location of the DNA cluster on the flow cell surface that contained the
//! associated sequence.
//!
//! Illumina was not involved in the creation of this library in any way.
#![crate_type="lib"]
#![deny(warnings, missing_docs)]
use std::convert::From;
use std::result::Result;
use std::num;
/// The sample a read was assigned to.
///
/// This is either the number from the sample sheet or, if the read was from the Undetermined
/// Reads, a sequence.
#[derive(Debug, Clone, PartialOrd, PartialEq, Eq, Hash)]
pub enum Sample {
    /// Sample number
    Number(u8),
    /// Sequence from Undetermined Reads
    Sequence(String),
}
/// A parsed sequence identifier
///
/// Every field corresponds to one component of the identifier line of a FASTQ record.
#[derive(Debug, PartialEq)]
pub struct SequenceIdentifier {
    /// ID of the sequencing machine
    pub sequencer_id: String,
    /// The number of sequencing runs this machine has performed
    pub run_count: u16,
    /// ID of the flow cell, printed on the side of the glass slide
    pub flow_cell_id: String,
    /// Lane number. For MiSeqs, this is always 1
    pub lane: u8,
    /// The near or far side of the flow cell surface
    pub side: u8,
    /// The row within a lane, if wide enough. For MiSeqs, this is always 1
    pub swath: u8,
    /// The positional order of the region where the cluster is located
    pub tile: u8,
    /// The x-coordinate of the cluster
    pub x: u16,
    /// The y-coordinate of the cluster
    pub y: u16,
    /// The read number
    pub read: u8,
    /// Whether the read was filtered for low quality (Y=filtered)
    pub is_filtered: bool,
    /// Indicates the type of control, 0 = not a control read
    pub control_number: u8,
    /// Number from sample sheet, or the sequence if the read is in Undetermined Reads
    pub sample: Sample,
}
/// Errors encountered when parsing FASTQ files
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`], so it can be boxed as a
/// `Box<dyn Error>` or propagated with `?` from functions returning a more general error type.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IlluminaError {
    /// We expected an integer but did not find one
    ParseError,
    /// The line was not structured as expected
    SplitError,
}

impl std::fmt::Display for IlluminaError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let message = match *self {
            IlluminaError::ParseError => "a field of the sequence identifier could not be parsed",
            IlluminaError::SplitError => "the sequence identifier was not structured as expected",
        };
        f.write_str(message)
    }
}

impl std::error::Error for IlluminaError {}
/// Conversion that lets the `?` operator turn a failed integer parse into an `IlluminaError`.
impl From<num::ParseIntError> for IlluminaError {
    /// Discards the details of the underlying failure and reports `ParseError`.
    fn from(_source: num::ParseIntError) -> Self {
        IlluminaError::ParseError
    }
}
/// Parses location information from an Illumina sequence identifier. This implementation is
/// about 3x faster than using a regular expression.
///
/// Leading and trailing whitespace (such as a trailing newline) is ignored. The first character
/// of the identifier (`@` in the example below) is dropped without being checked.
///
/// The fields in the example identifier
/// `@M03745:11:000000000-B54L5:1:2108:4127:8949 1:N:0:0` have the following meaning:
///
/// * `M03745` - ID of the sequencing machine
/// * `11` - run count for this machine
/// * `000000000-B54L5` - ID of the flow cell. "B54L5" will be printed on the flow cell in this
///   example
/// * `1` - lane number. For MiSeqs, there's only one lane
/// * `2108` - the first digit is the side of the chip. The second digit is the swath. For MiSeqs,
///   this is always 1. For HiSeqs, each lane is two tiles wide, and the first pass from
///   left-to-right is swath one, then the returning pass on the other side of the lane is swath
///   two. The remaining digits are the order of the tile. For MiSeqs, this is a number from 1
///   to 19
/// * `4127` - the x-position of the read in the tile, in arbitrary units
/// * `8949` - the y-position of the read in the tile, in arbitrary units
/// * `1` - first (forward) read in a paired-end run
/// * `N` - read was not filtered (sufficient quality)
/// * `0` - this was not a control
/// * `0` - this was the first sample on the sample sheet. A value that does not parse as a `u8`
///   is returned verbatim as `Sample::Sequence`
///
/// See <https://help.basespace.illumina.com/articles/descriptive/fastq-files/> for more
/// information.
///
/// # Errors
///
/// Returns `IlluminaError::SplitError` if the trimmed text is not exactly two parts separated by
/// a single space, if the first part does not have seven `:`-separated fields, if the second
/// part does not have four, or if the first field has no leading single-byte character to drop.
///
/// Returns `IlluminaError::ParseError` if a numeric field does not parse into its integer type,
/// if the side/swath/tile field is too short to be split, or if the filter flag is neither `Y`
/// nor `N`.
///
/// This function does not panic on malformed input.
///
/// # Example
///
/// ```rust
/// extern crate illumina_coordinates;
/// use illumina_coordinates::Sample;
///
/// fn main() {
///     let line = "@M03745:11:000000000-B54L5:1:2108:4127:8949 1:N:0:0";
///     let seq_id = illumina_coordinates::parse_sequence_identifier(&line).unwrap();
///     assert_eq!(seq_id.sequencer_id, "M03745".to_string());
///     assert_eq!(seq_id.run_count, 11);
///     assert_eq!(seq_id.flow_cell_id, "000000000-B54L5".to_string());
///     assert_eq!(seq_id.lane, 1);
///     assert_eq!(seq_id.side, 2);
///     assert_eq!(seq_id.swath, 1);
///     assert_eq!(seq_id.tile, 8);
///     assert_eq!(seq_id.x, 4127);
///     assert_eq!(seq_id.y, 8949);
///     assert_eq!(seq_id.read, 1);
///     assert_eq!(seq_id.is_filtered, false);
///     assert_eq!(seq_id.control_number, 0);
///     assert_eq!(seq_id.sample, Sample::Number(0));
/// }
/// ```
pub fn parse_sequence_identifier(text: &str) -> Result<SequenceIdentifier, IlluminaError> {
    let halves: Vec<&str> = text.trim().split(' ').collect();
    if halves.len() != 2 {
        return Err(IlluminaError::SplitError);
    }
    let left: Vec<&str> = halves[0].split(':').collect();
    let right: Vec<&str> = halves[1].split(':').collect();
    // Both lengths are verified here, so the indexing below cannot go out of bounds.
    if left.len() != 7 || right.len() != 4 {
        return Err(IlluminaError::SplitError);
    }

    // Drop the leading marker character. `get` returns `None` instead of panicking when the
    // field is empty or its first character is wider than one byte.
    let sequencer_id = left[0]
        .get(1..)
        .ok_or(IlluminaError::SplitError)?
        .to_string();
    let run_count = left[1].parse::<u16>()?;
    let flow_cell_id = left[2].to_string();
    let lane = left[3].parse::<u8>()?;

    // The tile field packs three values: one character of side, one of swath, and the rest is
    // the tile order. Checked slicing turns a too-short field into an error, not a panic.
    let tile_field = left[4];
    let side = tile_field
        .get(..1)
        .ok_or(IlluminaError::ParseError)?
        .parse::<u8>()?;
    let swath = tile_field
        .get(1..2)
        .ok_or(IlluminaError::ParseError)?
        .parse::<u8>()?;
    let tile = tile_field
        .get(2..)
        .ok_or(IlluminaError::ParseError)?
        .parse::<u8>()?;

    let x = left[5].parse::<u16>()?;
    let y = left[6].parse::<u16>()?;
    let read = right[0].parse::<u8>()?;
    let is_filtered = match right[1] {
        "Y" => true,
        "N" => false,
        _ => return Err(IlluminaError::ParseError),
    };
    let control_number = right[2].parse::<u8>()?;
    // Anything that is not a valid u8 is kept verbatim as a sequence.
    let sample = match right[3].parse::<u8>() {
        Ok(number) => Sample::Number(number),
        Err(_) => Sample::Sequence(String::from(right[3])),
    };

    Ok(SequenceIdentifier {
        sequencer_id,
        run_count,
        flow_cell_id,
        lane,
        side,
        swath,
        tile,
        x,
        y,
        read,
        is_filtered,
        control_number,
        sample,
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts every field that has the same value in both well-formed test identifiers
    /// (everything except the filter flag).
    fn assert_shared_fields(parsed: &SequenceIdentifier) {
        assert_eq!(parsed.sequencer_id, "M03745");
        assert_eq!(parsed.run_count, 11);
        assert_eq!(parsed.flow_cell_id, "000000000-B54L5");
        assert_eq!(parsed.lane, 1);
        assert_eq!(parsed.side, 2);
        assert_eq!(parsed.swath, 1);
        assert_eq!(parsed.tile, 8);
        assert_eq!(parsed.x, 4127);
        assert_eq!(parsed.y, 8949);
        assert_eq!(parsed.read, 1);
        assert_eq!(parsed.control_number, 0);
        assert_eq!(parsed.sample, Sample::Number(0));
    }

    /// A well-formed identifier with the filter flag set to `N`.
    #[test]
    fn test_parse() {
        let parsed =
            parse_sequence_identifier("@M03745:11:000000000-B54L5:1:2108:4127:8949 1:N:0:0")
                .unwrap();
        assert_shared_fields(&parsed);
        assert!(!parsed.is_filtered);
    }

    /// A trailing newline is tolerated; the filter flag here is `Y`.
    #[test]
    fn test_parse_with_newline() {
        let parsed =
            parse_sequence_identifier("@M03745:11:000000000-B54L5:1:2108:4127:8949 1:Y:0:0\n")
                .unwrap();
        assert_shared_fields(&parsed);
        assert!(parsed.is_filtered);
    }

    /// A line of bases is not a sequence identifier and must be rejected.
    #[test]
    fn test_parse_error() {
        assert!(parse_sequence_identifier("CACGACGACTAGCTACGGACGCGGCACGACGCAG").is_err());
    }
}